Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
856d90400a chore(uv): add --index-strategy=unsafe-first-match to l4t
This is because the main index might not contain all the dependencies
for torch

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-08 21:47:18 +00:00
38 changed files with 184 additions and 3477 deletions

View File

@@ -16,7 +16,7 @@ RUN apt-get update && \
# The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
FROM requirements AS requirements-drivers
ARG VULKAN_FROM_SOURCE=false
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=0
@@ -41,7 +41,7 @@ RUN <<EOT bash
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils mesa-vulkan-drivers
if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
if [ "amd64" = "$TARGETARCH" ]; then
wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -59,11 +59,6 @@ RUN <<EOT bash
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
rm -rf /opt/vulkan-sdk
elif [ "amd64" = "${TARGETARCH}}" ]; then
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt-get update && \
apt-get install -y vulkan-sdk
fi
if [ "arm64" = "$TARGETARCH" ]; then
mkdir vulkan && cd vulkan && \
@@ -111,7 +106,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
if [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -46,7 +46,7 @@ RUN <<EOT bash
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
if [ "amd64" = "$TARGETARCH" ]; then
wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -64,11 +64,6 @@ RUN <<EOT bash
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
rm -rf /opt/vulkan-sdk
elif [ "amd64" = "${TARGETARCH}}" ]; then
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt-get update && \
apt-get install -y vulkan-sdk
fi
if [ "arm64" = "$TARGETARCH" ]; then
mkdir vulkan && cd vulkan && \
@@ -99,11 +94,7 @@ RUN <<EOT bash
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
else
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
fi
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
@@ -115,7 +106,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
if [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -103,7 +103,7 @@ RUN <<EOT bash
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
if [ "amd64" = "$TARGETARCH" ]; then
wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -121,11 +121,6 @@ RUN <<EOT bash
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
rm -rf /opt/vulkan-sdk
elif [ "amd64" = "${TARGETARCH}}" ]; then
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt-get update && \
apt-get install -y vulkan-sdk
fi
if [ "arm64" = "$TARGETARCH" ]; then
mkdir vulkan && cd vulkan && \
@@ -153,14 +148,11 @@ RUN <<EOT bash
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
echo https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
else
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
fi
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
@@ -172,7 +164,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
if [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -60,7 +60,7 @@ RUN <<EOT bash
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
if [ "amd64" = "$TARGETARCH" ]; then
wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -78,11 +78,6 @@ RUN <<EOT bash
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
rm -rf /opt/vulkan-sdk
elif [ "amd64" = "${TARGETARCH}}" ]; then
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt-get update && \
apt-get install -y vulkan-sdk
fi
if [ "arm64" = "$TARGETARCH" ]; then
mkdir vulkan && cd vulkan && \
@@ -113,11 +108,7 @@ RUN <<EOT bash
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
else
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
fi
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
@@ -129,7 +120,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
if [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=b1377188784f9aea26b8abde56d4aee8c733eec7
LLAMA_VERSION?=ae9f8df77882716b1702df2bed8919499e64cc28
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -23,7 +23,6 @@
#include <grpcpp/health_check_service_interface.h>
#include <regex>
#include <atomic>
#include <mutex>
#include <signal.h>
#include <thread>
@@ -391,9 +390,8 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
// Initialize fit_params options (can be overridden by options)
// fit_params: whether to auto-adjust params to fit device memory (default: true as in llama.cpp)
params.fit_params = true;
// fit_params_target: target margin per device in bytes (default: 1GB per device)
// Initialize as vector with default value for all devices
params.fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024 * 1024);
// fit_params_target: target margin per device in bytes (default: 1GB)
params.fit_params_target = 1024 * 1024 * 1024;
// fit_params_min_ctx: minimum context size for fit (default: 4096)
params.fit_params_min_ctx = 4096;
@@ -470,28 +468,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
} else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
if (optval != NULL) {
try {
// Value is in MiB, can be comma-separated list for multiple devices
// Single value is broadcast across all devices
std::string arg_next = optval_str;
const std::regex regex{ R"([,/]+)" };
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
std::vector<std::string> split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) {
// Too many values provided
continue;
}
if (split_arg.size() == 1) {
// Single value: broadcast to all devices
size_t value_mib = std::stoul(split_arg[0]);
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), value_mib * 1024 * 1024);
} else {
// Multiple values: set per device
for (size_t i = 0; i < split_arg.size() && i < params.fit_params_target.size(); i++) {
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024 * 1024;
}
}
// Value is in MiB, convert to bytes
params.fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
} catch (const std::exception& e) {
// If conversion fails, keep default value (1GB per device)
// If conversion fails, keep default value (1GB)
}
}
} else if (!strcmp(optname, "fit_params_min_ctx") || !strcmp(optname, "fit_ctx")) {
@@ -706,13 +686,13 @@ private:
public:
BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {}
grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) override {
grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) {
// Implement Health RPC
reply->set_message("OK");
return Status::OK;
}
grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) override {
grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) {
// Implement LoadModel RPC
common_params params;
params_parse(ctx_server, request, params);
@@ -729,72 +709,11 @@ public:
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
// Capture error messages during model loading
struct error_capture {
std::string captured_error;
std::mutex error_mutex;
ggml_log_callback original_callback;
void* original_user_data;
} error_capture_data;
// Get original log callback
llama_log_get(&error_capture_data.original_callback, &error_capture_data.original_user_data);
// Set custom callback to capture errors
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
auto* capture = static_cast<error_capture*>(user_data);
// Capture error messages
if (level == GGML_LOG_LEVEL_ERROR) {
std::lock_guard<std::mutex> lock(capture->error_mutex);
// Append error message, removing trailing newlines
std::string msg(text);
while (!msg.empty() && (msg.back() == '\n' || msg.back() == '\r')) {
msg.pop_back();
}
if (!msg.empty()) {
if (!capture->captured_error.empty()) {
capture->captured_error.append("; ");
}
capture->captured_error.append(msg);
}
}
// Also call original callback to preserve logging
if (capture->original_callback) {
capture->original_callback(level, text, capture->original_user_data);
}
}, &error_capture_data);
// load the model
bool load_success = ctx_server.load_model(params);
// Restore original log callback
llama_log_set(error_capture_data.original_callback, error_capture_data.original_user_data);
if (!load_success) {
std::string error_msg = "Failed to load model: " + params.model.path;
if (!params.mmproj.path.empty()) {
error_msg += " (with mmproj: " + params.mmproj.path + ")";
}
if (params.has_speculative() && !params.speculative.model.path.empty()) {
error_msg += " (with draft model: " + params.speculative.model.path + ")";
}
// Add captured error details if available
{
std::lock_guard<std::mutex> lock(error_capture_data.error_mutex);
if (!error_capture_data.captured_error.empty()) {
error_msg += ". Error: " + error_capture_data.captured_error;
} else {
error_msg += ". Model file may not exist or be invalid.";
}
}
result->set_message(error_msg);
if (!ctx_server.load_model(params)) {
result->set_message("Failed loading model");
result->set_success(false);
return grpc::Status(grpc::StatusCode::INTERNAL, error_msg);
return Status::CANCELLED;
}
// Process grammar triggers now that vocab is available
@@ -1573,7 +1492,7 @@ public:
return grpc::Status::OK;
}
grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) override {
grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
@@ -2244,7 +2163,7 @@ public:
return grpc::Status::OK;
}
grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) override {
grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
@@ -2339,7 +2258,7 @@ public:
return grpc::Status::OK;
}
grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) override {
grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
if (!params_base.embedding || params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
return grpc::Status(grpc::StatusCode::UNIMPLEMENTED, "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
}
@@ -2425,7 +2344,7 @@ public:
return grpc::Status::OK;
}
grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
@@ -2448,7 +2367,7 @@ public:
return grpc::Status::OK;
}
grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) override {
grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) {
// request slots data using task queue
auto rd = ctx_server.get_response_reader();

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=0e52afc6513cc2dea9a1a017afc4a008d5acf2b0
STABLEDIFFUSION_GGML_VERSION?=9be0b91927dfa4007d053df72dea7302990226bb
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -15,11 +15,14 @@ fi
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"
if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
USE_PIP=true
# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
# We need uv to continue falling through to the official PyPI index when it encounters these errors.
if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
fi
EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"
installRequirements

View File

@@ -1,5 +0,0 @@
# Build dependencies needed for packages installed from source (e.g., git dependencies)
# When using --no-build-isolation, these must be installed in the venv first
wheel
setuptools
packaging

View File

@@ -16,10 +16,6 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
USE_PIP=true
fi
# Use python 3.12 for l4t
if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
PYTHON_VERSION="3.12"
@@ -27,4 +23,11 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
PY_STANDALONE_TAG="20251120"
fi
# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
# We need uv to continue falling through to the official PyPI index when it encounters these errors.
if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
fi
installRequirements

View File

@@ -16,8 +16,11 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
USE_PIP=true
# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
# We need uv to continue falling through to the official PyPI index when it encounters these errors.
if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
fi
installRequirements

View File

@@ -16,6 +16,13 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
# We need uv to continue falling through to the official PyPI index when it encounters these errors.
if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
fi
if [ "x${BUILD_TYPE}" == "xcublas" ] || [ "x${BUILD_TYPE}" == "xl4t" ]; then
export CMAKE_ARGS="-DGGML_CUDA=on"
fi
@@ -26,12 +33,6 @@ fi
EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"
if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
USE_PIP=true
fi
git clone https://github.com/neuphonic/neutts-air neutts-air
cp -rfv neutts-air/neuttsair ./

View File

@@ -23,8 +23,11 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
PY_STANDALONE_TAG="20251120"
fi
if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
USE_PIP=true
# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
# We need uv to continue falling through to the official PyPI index when it encounters these errors.
if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
fi
installRequirements

View File

@@ -63,25 +63,6 @@ func (m *GalleryBackend) IsMeta() bool {
return len(m.CapabilitiesMap) > 0 && m.URI == ""
}
// IsCompatibleWith checks if the backend is compatible with the current system capability.
// For meta backends, it checks if any of the capabilities in the map match the system capability.
// For concrete backends, it delegates to SystemState.IsBackendCompatible.
func (m *GalleryBackend) IsCompatibleWith(systemState *system.SystemState) bool {
if systemState == nil {
return true
}
// Meta backends are compatible if the system capability matches one of the keys
if m.IsMeta() {
capability := systemState.Capability(m.CapabilitiesMap)
_, exists := m.CapabilitiesMap[capability]
return exists
}
// For concrete backends, delegate to the system package
return systemState.IsBackendCompatible(m.Name, m.URI)
}
func (m *GalleryBackend) SetInstalled(installed bool) {
m.Installed = installed
}

View File

@@ -172,252 +172,6 @@ var _ = Describe("Gallery Backends", func() {
Expect(nilMetaBackend.IsMeta()).To(BeFalse())
})
It("should check IsCompatibleWith correctly for meta backends", func() {
metaBackend := &GalleryBackend{
Metadata: Metadata{
Name: "meta-backend",
},
CapabilitiesMap: map[string]string{
"nvidia": "nvidia-backend",
"amd": "amd-backend",
"default": "default-backend",
},
}
// Test with nil state - should be compatible
Expect(metaBackend.IsCompatibleWith(nil)).To(BeTrue())
// Test with NVIDIA system - should be compatible (has nvidia key)
nvidiaState := &system.SystemState{GPUVendor: "nvidia", VRAM: 8 * 1024 * 1024 * 1024}
Expect(metaBackend.IsCompatibleWith(nvidiaState)).To(BeTrue())
// Test with default (no GPU) - should be compatible (has default key)
defaultState := &system.SystemState{}
Expect(metaBackend.IsCompatibleWith(defaultState)).To(BeTrue())
})
Describe("IsCompatibleWith for concrete backends", func() {
Context("CPU backends", func() {
It("should be compatible on all systems", func() {
cpuBackend := &GalleryBackend{
Metadata: Metadata{
Name: "cpu-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp",
}
Expect(cpuBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
Expect(cpuBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
Expect(cpuBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
})
Context("Darwin/Metal backends", func() {
When("running on darwin", func() {
BeforeEach(func() {
if runtime.GOOS != "darwin" {
Skip("Skipping darwin-specific tests on non-darwin system")
}
})
It("should be compatible for MLX backend", func() {
mlxBackend := &GalleryBackend{
Metadata: Metadata{
Name: "mlx",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx",
}
Expect(mlxBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
})
It("should be compatible for metal-llama-cpp backend", func() {
metalBackend := &GalleryBackend{
Metadata: Metadata{
Name: "metal-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp",
}
Expect(metalBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
})
})
When("running on non-darwin", func() {
BeforeEach(func() {
if runtime.GOOS == "darwin" {
Skip("Skipping non-darwin-specific tests on darwin system")
}
})
It("should NOT be compatible for MLX backend", func() {
mlxBackend := &GalleryBackend{
Metadata: Metadata{
Name: "mlx",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx",
}
Expect(mlxBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
})
It("should NOT be compatible for metal-llama-cpp backend", func() {
metalBackend := &GalleryBackend{
Metadata: Metadata{
Name: "metal-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp",
}
Expect(metalBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
})
})
})
Context("NVIDIA/CUDA backends", func() {
When("running on non-darwin", func() {
BeforeEach(func() {
if runtime.GOOS == "darwin" {
Skip("Skipping CUDA tests on darwin system")
}
})
It("should NOT be compatible without nvidia GPU", func() {
cudaBackend := &GalleryBackend{
Metadata: Metadata{
Name: "cuda12-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp",
}
Expect(cudaBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
Expect(cudaBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeFalse())
})
It("should be compatible with nvidia GPU", func() {
cudaBackend := &GalleryBackend{
Metadata: Metadata{
Name: "cuda12-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp",
}
Expect(cudaBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
It("should be compatible with cuda13 backend on nvidia GPU", func() {
cuda13Backend := &GalleryBackend{
Metadata: Metadata{
Name: "cuda13-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp",
}
Expect(cuda13Backend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
})
})
Context("AMD/ROCm backends", func() {
When("running on non-darwin", func() {
BeforeEach(func() {
if runtime.GOOS == "darwin" {
Skip("Skipping AMD/ROCm tests on darwin system")
}
})
It("should NOT be compatible without AMD GPU", func() {
rocmBackend := &GalleryBackend{
Metadata: Metadata{
Name: "rocm-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp",
}
Expect(rocmBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
Expect(rocmBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeFalse())
})
It("should be compatible with AMD GPU", func() {
rocmBackend := &GalleryBackend{
Metadata: Metadata{
Name: "rocm-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp",
}
Expect(rocmBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
It("should be compatible with hipblas backend on AMD GPU", func() {
hipBackend := &GalleryBackend{
Metadata: Metadata{
Name: "hip-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-hip-llama-cpp",
}
Expect(hipBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
})
})
Context("Intel/SYCL backends", func() {
When("running on non-darwin", func() {
BeforeEach(func() {
if runtime.GOOS == "darwin" {
Skip("Skipping Intel/SYCL tests on darwin system")
}
})
It("should NOT be compatible without Intel GPU", func() {
intelBackend := &GalleryBackend{
Metadata: Metadata{
Name: "intel-sycl-f16-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp",
}
Expect(intelBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
Expect(intelBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeFalse())
})
It("should be compatible with Intel GPU", func() {
intelBackend := &GalleryBackend{
Metadata: Metadata{
Name: "intel-sycl-f16-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp",
}
Expect(intelBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Intel, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
It("should be compatible with intel-sycl-f32 backend on Intel GPU", func() {
intelF32Backend := &GalleryBackend{
Metadata: Metadata{
Name: "intel-sycl-f32-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp",
}
Expect(intelF32Backend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Intel, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
It("should be compatible with intel-transformers backend on Intel GPU", func() {
intelTransformersBackend := &GalleryBackend{
Metadata: Metadata{
Name: "intel-transformers",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-intel-transformers",
}
Expect(intelTransformersBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Intel, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
})
})
})
Context("Vulkan backends", func() {
It("should be compatible on CPU-only systems", func() {
// Vulkan backends don't have a specific GPU vendor requirement in the current logic
// They are compatible if no other GPU-specific pattern matches
vulkanBackend := &GalleryBackend{
Metadata: Metadata{
Name: "vulkan-llama-cpp",
},
URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp",
}
// Vulkan doesn't have vendor-specific filtering in current implementation
Expect(vulkanBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
})
})
})
It("should find best backend from meta based on system capabilities", func() {
metaBackend := &GalleryBackend{

View File

@@ -226,16 +226,6 @@ func AvailableGalleryModels(galleries []config.Gallery, systemState *system.Syst
// List available backends
func AvailableBackends(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) {
return availableBackendsWithFilter(galleries, systemState, true)
}
// AvailableBackendsUnfiltered returns all available backends without filtering by system capability.
// It delegates to availableBackendsWithFilter with capability filtering disabled.
func AvailableBackendsUnfiltered(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) {
return availableBackendsWithFilter(galleries, systemState, false)
}
// availableBackendsWithFilter is a helper function that lists available backends with optional filtering.
func availableBackendsWithFilter(galleries []config.Gallery, systemState *system.SystemState, filterByCapability bool) (GalleryElements[*GalleryBackend], error) {
var backends []*GalleryBackend
systemBackends, err := ListSystemBackends(systemState)
@@ -251,17 +241,7 @@ func availableBackendsWithFilter(galleries []config.Gallery, systemState *system
if err != nil {
return nil, err
}
// Filter backends by system capability if requested
if filterByCapability {
for _, backend := range galleryBackends {
if backend.IsCompatibleWith(systemState) {
backends = append(backends, backend)
}
}
} else {
backends = append(backends, galleryBackends...)
}
backends = append(backends, galleryBackends...)
}
return backends, nil

View File

@@ -205,7 +205,6 @@ func API(application *application.Application) (*echo.Echo, error) {
routes.RegisterLocalAIRoutes(e, requestExtractor, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService(), opcache, application.TemplatesEvaluator(), application)
routes.RegisterOpenAIRoutes(e, requestExtractor, application)
routes.RegisterAnthropicRoutes(e, requestExtractor, application)
if !application.ApplicationConfig().DisableWebUI {
routes.RegisterUIAPIRoutes(e, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService(), opcache, application)
routes.RegisterUIRoutes(e, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService())

View File

@@ -1,537 +0,0 @@
package anthropic
import (
"encoding/json"
"fmt"
"github.com/google/uuid"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/templates"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/xlog"
)
// MessagesEndpoint is the Anthropic Messages API endpoint
// https://docs.anthropic.com/claude/reference/messages_post
// @Summary Generate a message response for the given messages and model.
// @Param request body schema.AnthropicRequest true "query params"
// @Success 200 {object} schema.AnthropicResponse "Response"
// @Router /v1/messages [post]
func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
	return func(c echo.Context) error {
		requestID := uuid.New().String()

		// Both the parsed request and the model configuration are expected to
		// have been stored on the echo context before this handler runs.
		req, found := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.AnthropicRequest)
		if !found || req.Model == "" {
			return sendAnthropicError(c, 400, "invalid_request_error", "model is required")
		}

		modelCfg, found := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
		if !found || modelCfg == nil {
			return sendAnthropicError(c, 400, "invalid_request_error", "model configuration not found")
		}

		if req.MaxTokens <= 0 {
			return sendAnthropicError(c, 400, "invalid_request_error", "max_tokens is required and must be greater than 0")
		}

		xlog.Debug("Anthropic Messages endpoint configuration read", "config", modelCfg)

		// Translate the Anthropic payload into the internal (OpenAI-shaped)
		// messages and function definitions.
		chatMessages := convertAnthropicToOpenAIMessages(req)
		tools, useTools := convertAnthropicTools(req, modelCfg)

		// Request-level sampling parameters override the model configuration.
		if req.Temperature != nil {
			modelCfg.Temperature = req.Temperature
		}
		if req.TopK != nil {
			modelCfg.TopK = req.TopK
		}
		if req.TopP != nil {
			modelCfg.TopP = req.TopP
		}
		modelCfg.Maxtokens = &req.MaxTokens

		// OpenAI-compatible request used for templating and inference.
		internalReq := &schema.OpenAIRequest{
			PredictionOptions: schema.PredictionOptions{
				BasicModelRequest: schema.BasicModelRequest{Model: req.Model},
				Temperature:       req.Temperature,
				TopK:              req.TopK,
				TopP:              req.TopP,
				Maxtokens:         &req.MaxTokens,
			},
			Messages: chatMessages,
			Stream:   req.Stream,
			Context:  req.Context,
			Cancel:   req.Cancel,
		}

		// Stop sequences go both on the request and on the model's stop words.
		if len(req.StopSequences) > 0 {
			internalReq.Stop = req.StopSequences
			modelCfg.StopWords = append(modelCfg.StopWords, req.StopSequences...)
		}

		// Template the prompt, including tools when they are in use.
		prompt := evaluator.TemplateMessages(*internalReq, internalReq.Messages, modelCfg, tools, useTools)
		xlog.Debug("Anthropic Messages - Prompt (after templating)", "prompt", prompt)

		// Pick the streaming or the buffered responder.
		respond := handleAnthropicNonStream
		if req.Stream {
			respond = handleAnthropicStream
		}
		return respond(c, requestID, req, modelCfg, ml, prompt, internalReq, tools, useTools)
	}
}
// handleAnthropicNonStream runs a single blocking inference and answers with
// one Anthropic message as JSON. When shouldUseFn is set and tool calls are
// parsed from the output, the reply carries tool_use blocks (preceded by a
// text block if any text was parsed) and stop reason "tool_use"; otherwise it
// carries one text block and stop reason "end_turn". funcs is not used here.
func handleAnthropicNonStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool) error {
	// Gather every image attached to any message.
	images := []string{}
	for i := range openAIReq.Messages {
		images = append(images, openAIReq.Messages[i].StringImages...)
	}

	infer, err := backend.ModelInference(
		input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, nil, nil, nil, "", "", nil, nil, nil)
	if err != nil {
		xlog.Error("Anthropic model inference failed", "error", err)
		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
	}

	prediction, err := infer()
	if err != nil {
		xlog.Error("Anthropic prediction failed", "error", err)
		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
	}

	output := backend.Finetune(*cfg, predInput, prediction.Response)

	// Tool calls are parsed unconditionally; they are only honoured when
	// function calling is enabled for this request.
	parsedCalls := functions.ParseFunctionCall(output, cfg.FunctionsConfig)

	var (
		blocks     []schema.AnthropicContentBlock
		stopReason string
	)

	if !shouldUseFn || len(parsedCalls) == 0 {
		// Plain text answer.
		stopReason = "end_turn"
		blocks = []schema.AnthropicContentBlock{
			{Type: "text", Text: output},
		}
	} else {
		stopReason = "tool_use"

		toolBlocks := make([]schema.AnthropicContentBlock, 0, len(parsedCalls))
		for idx, call := range parsedCalls {
			// Arguments are expected to be a JSON object; keep the raw string
			// under "raw" when they are not.
			var args map[string]interface{}
			if jsonErr := json.Unmarshal([]byte(call.Arguments), &args); jsonErr != nil {
				xlog.Warn("Failed to parse tool call arguments as JSON", "error", jsonErr, "args", call.Arguments)
				args = map[string]interface{}{"raw": call.Arguments}
			}
			toolBlocks = append(toolBlocks, schema.AnthropicContentBlock{
				Type:  "tool_use",
				ID:    fmt.Sprintf("toolu_%s_%d", id, idx),
				Name:  call.Name,
				Input: args,
			})
		}

		// Any text parsed from the output goes in front of the tool calls.
		if leading := functions.ParseTextContent(output, cfg.FunctionsConfig); leading != "" {
			blocks = append([]schema.AnthropicContentBlock{{Type: "text", Text: leading}}, toolBlocks...)
		} else {
			blocks = toolBlocks
		}
	}

	resp := &schema.AnthropicResponse{
		ID:         fmt.Sprintf("msg_%s", id),
		Type:       "message",
		Role:       "assistant",
		Model:      input.Model,
		StopReason: &stopReason,
		Content:    blocks,
		Usage: schema.AnthropicUsage{
			InputTokens:  prediction.Usage.Prompt,
			OutputTokens: prediction.Usage.Completion,
		},
	}

	// Debug-log the serialized response; a marshal failure is not fatal here.
	if encoded, marshalErr := json.Marshal(resp); marshalErr == nil {
		xlog.Debug("Anthropic Response", "response", string(encoded))
	}

	return c.JSON(200, resp)
}
// handleAnthropicStream runs inference and relays the output to the client as
// Anthropic-style server-sent events, in this order: message_start,
// content_block_start, content_block_delta (repeated), content_block_stop,
// message_delta and message_stop.
//
// When shouldUseFn is set, the accumulated output is re-parsed on every token.
// Newly detected tool calls are emitted as complete tool_use blocks (start,
// one input_json_delta, stop) and no further text deltas are sent afterwards.
//
// The SSE stream is opened before inference starts, so the response is already
// committed when inference can fail. Failures are therefore reported in-band
// as an SSE "error" event instead of a JSON error response, and nil is
// returned. funcs is not used here.
func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool) error {
	c.Response().Header().Set("Content-Type", "text/event-stream")
	c.Response().Header().Set("Cache-Control", "no-cache")
	c.Response().Header().Set("Connection", "keep-alive")

	// Create OpenAI messages for inference
	openAIMessages := openAIReq.Messages
	images := []string{}
	for _, m := range openAIMessages {
		images = append(images, m.StringImages...)
	}

	// Send message_start event
	messageStart := schema.AnthropicStreamEvent{
		Type: "message_start",
		Message: &schema.AnthropicStreamMessage{
			ID:      fmt.Sprintf("msg_%s", id),
			Type:    "message",
			Role:    "assistant",
			Content: []schema.AnthropicContentBlock{},
			Model:   input.Model,
			Usage:   schema.AnthropicUsage{InputTokens: 0, OutputTokens: 0},
		},
	}
	sendAnthropicSSE(c, messageStart)

	// Track accumulated content for tool call detection
	accumulatedContent := ""
	currentBlockIndex := 0
	inToolCall := false
	toolCallsEmitted := 0

	// Send initial content_block_start event
	contentBlockStart := schema.AnthropicStreamEvent{
		Type:         "content_block_start",
		Index:        currentBlockIndex,
		ContentBlock: &schema.AnthropicContentBlock{Type: "text", Text: ""},
	}
	sendAnthropicSSE(c, contentBlockStart)

	// Stream content deltas. The callback always returns true, i.e. it never
	// asks the backend to stop generating.
	tokenCallback := func(token string, usage backend.TokenUsage) bool {
		accumulatedContent += token

		// If we're using functions, try to detect tool calls incrementally
		if shouldUseFn {
			cleanedResult := functions.CleanupLLMResult(accumulatedContent, cfg.FunctionsConfig)

			// Try parsing for tool calls
			toolCalls := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)

			// If we detected new tool calls and haven't emitted them yet
			if len(toolCalls) > toolCallsEmitted {
				// Close the initial text block the first time a tool call shows up
				if !inToolCall && currentBlockIndex == 0 {
					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
						Type:  "content_block_stop",
						Index: currentBlockIndex,
					})
					currentBlockIndex++
					inToolCall = true
				}

				// Emit only the tool calls that were not sent before
				for i := toolCallsEmitted; i < len(toolCalls); i++ {
					tc := toolCalls[i]

					// Send content_block_start for tool_use
					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
						Type:  "content_block_start",
						Index: currentBlockIndex,
						ContentBlock: &schema.AnthropicContentBlock{
							Type: "tool_use",
							ID:   fmt.Sprintf("toolu_%s_%d", id, i),
							Name: tc.Name,
						},
					})

					// Send input_json_delta with the arguments
					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
						Type:  "content_block_delta",
						Index: currentBlockIndex,
						Delta: &schema.AnthropicStreamDelta{
							Type:        "input_json_delta",
							PartialJSON: tc.Arguments,
						},
					})

					// Send content_block_stop
					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
						Type:  "content_block_stop",
						Index: currentBlockIndex,
					})

					currentBlockIndex++
				}
				toolCallsEmitted = len(toolCalls)
				return true
			}
		}

		// Send regular text delta if not in tool call mode
		if !inToolCall {
			delta := schema.AnthropicStreamEvent{
				Type:  "content_block_delta",
				Index: 0,
				Delta: &schema.AnthropicStreamDelta{
					Type: "text_delta",
					Text: token,
				},
			}
			sendAnthropicSSE(c, delta)
		}
		return true
	}

	predFunc, err := backend.ModelInference(
		input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, nil, nil, tokenCallback, "", "", nil, nil, nil)
	if err != nil {
		xlog.Error("Anthropic stream model inference failed", "error", err)
		// Events were already flushed: a JSON error response can no longer be
		// sent, so report the failure on the stream itself.
		sendAnthropicStreamError(c, "api_error", fmt.Sprintf("model inference failed: %v", err))
		return nil
	}

	prediction, err := predFunc()
	if err != nil {
		xlog.Error("Anthropic stream prediction failed", "error", err)
		sendAnthropicStreamError(c, "api_error", fmt.Sprintf("prediction failed: %v", err))
		return nil
	}

	// Send content_block_stop event for last block if we didn't close it yet
	if !inToolCall {
		contentBlockStop := schema.AnthropicStreamEvent{
			Type:  "content_block_stop",
			Index: 0,
		}
		sendAnthropicSSE(c, contentBlockStop)
	}

	// Determine stop reason
	stopReason := "end_turn"
	if toolCallsEmitted > 0 {
		stopReason = "tool_use"
	}

	// Send message_delta event with stop_reason
	messageDelta := schema.AnthropicStreamEvent{
		Type: "message_delta",
		Delta: &schema.AnthropicStreamDelta{
			StopReason: &stopReason,
		},
		Usage: &schema.AnthropicUsage{
			OutputTokens: prediction.Usage.Completion,
		},
	}
	sendAnthropicSSE(c, messageDelta)

	// Send message_stop event
	messageStop := schema.AnthropicStreamEvent{
		Type: "message_stop",
	}
	sendAnthropicSSE(c, messageStop)

	return nil
}

// sendAnthropicStreamError reports a failure on an already-started SSE stream
// as an "error" event whose data is the same error envelope used for
// non-streaming error responses.
func sendAnthropicStreamError(c echo.Context, errorType, message string) {
	payload, err := json.Marshal(schema.AnthropicErrorResponse{
		Type: "error",
		Error: schema.AnthropicError{
			Type:    errorType,
			Message: message,
		},
	})
	if err != nil {
		xlog.Error("Failed to marshal SSE error event", "error", err)
		return
	}
	fmt.Fprintf(c.Response().Writer, "event: error\ndata: %s\n\n", payload)
	c.Response().Flush()
}
// sendAnthropicSSE serializes event to JSON and writes it to the response as
// one server-sent event ("event: <type>" followed by "data: <json>"), then
// flushes the response. If the event cannot be marshalled, the error is
// logged and nothing is written.
func sendAnthropicSSE(c echo.Context, event schema.AnthropicStreamEvent) {
	payload, marshalErr := json.Marshal(event)
	if marshalErr != nil {
		xlog.Error("Failed to marshal SSE event", "error", marshalErr)
		return
	}

	resp := c.Response()
	fmt.Fprintf(resp.Writer, "event: %s\ndata: %s\n\n", event.Type, payload)
	resp.Flush()
}
// sendAnthropicError answers the request with the given HTTP status code and
// an error envelope of type "error" carrying errorType and message.
func sendAnthropicError(c echo.Context, statusCode int, errorType, message string) error {
	return c.JSON(statusCode, schema.AnthropicErrorResponse{
		Type: "error",
		Error: schema.AnthropicError{
			Type:    errorType,
			Message: message,
		},
	})
}
// convertAnthropicToOpenAIMessages converts an Anthropic request into the
// internal OpenAI-style message list.
//
// A non-empty input.System becomes a leading "system" message. Each Anthropic
// message keeps its role; its content is either a plain string or an array of
// content blocks, which are handled as follows:
//   - "text": appended to the message text.
//   - "image": base64 sources are turned into data URIs in StringImages.
//   - "tool_use": converted to a ToolCall with the input serialized as JSON.
//   - "tool_result": on "user" messages only, appended to the message text as
//     "[Tool Result for <id>]: <text>", prefixed with "Error: " when the block
//     sets is_error to true.
//
// Messages whose content is neither a string nor an array carry only the role.
func convertAnthropicToOpenAIMessages(input *schema.AnthropicRequest) []schema.Message {
	var messages []schema.Message

	// Add system message if present
	if input.System != "" {
		messages = append(messages, schema.Message{
			Role:          "system",
			StringContent: input.System,
			Content:       input.System,
		})
	}

	// Convert Anthropic messages to OpenAI format
	for _, msg := range input.Messages {
		openAIMsg := schema.Message{
			Role: msg.Role,
		}

		// Handle content (can be string or array of content blocks)
		switch content := msg.Content.(type) {
		case string:
			openAIMsg.StringContent = content
			openAIMsg.Content = content
		case []interface{}:
			var textContent string
			var stringImages []string
			var toolCalls []schema.ToolCall

			for _, block := range content {
				blockMap, ok := block.(map[string]interface{})
				if !ok {
					continue
				}

				blockType, _ := blockMap["type"].(string)
				switch blockType {
				case "text":
					if text, ok := blockMap["text"].(string); ok {
						textContent += text
					}
				case "image":
					// Only base64-encoded sources are supported.
					source, ok := blockMap["source"].(map[string]interface{})
					if !ok {
						continue
					}
					if sourceType, ok := source["type"].(string); !ok || sourceType != "base64" {
						continue
					}
					if data, ok := source["data"].(string); ok {
						mediaType, _ := source["media_type"].(string)
						// Format as data URI
						stringImages = append(stringImages, fmt.Sprintf("data:%s;base64,%s", mediaType, data))
					}
				case "tool_use":
					// Convert tool_use to ToolCall format
					toolID, _ := blockMap["id"].(string)
					toolName, _ := blockMap["name"].(string)

					// Serialize input to JSON string, falling back to an empty object
					inputJSON, err := json.Marshal(blockMap["input"])
					if err != nil {
						xlog.Warn("Failed to marshal tool input", "error", err)
						inputJSON = []byte("{}")
					}

					toolCalls = append(toolCalls, schema.ToolCall{
						Index: len(toolCalls),
						ID:    toolID,
						Type:  "function",
						FunctionCall: schema.FunctionCall{
							Name:      toolName,
							Arguments: string(inputJSON),
						},
					})
				case "tool_result":
					// Tool results are folded into the text of user messages;
					// on any other role they are dropped.
					if msg.Role != "user" {
						continue
					}

					toolUseID, _ := blockMap["tool_use_id"].(string)

					// A decoded JSON boolean is a plain bool, never a *bool.
					isError, _ := blockMap["is_error"].(bool)

					prefix := ""
					if isError {
						prefix = "Error: "
					}
					resultText := anthropicToolResultText(blockMap["content"])
					textContent += fmt.Sprintf("\n[Tool Result for %s]: %s%s", toolUseID, prefix, resultText)
				}
			}

			openAIMsg.StringContent = textContent
			openAIMsg.Content = textContent
			openAIMsg.StringImages = stringImages

			// Add tool calls if present
			if len(toolCalls) > 0 {
				openAIMsg.ToolCalls = toolCalls
			}
		}

		messages = append(messages, openAIMsg)
	}

	return messages
}

// anthropicToolResultText extracts the text of a tool_result "content" value,
// which is either a plain string or an array of content blocks. For an array,
// the text of every "text" block is concatenated. Any other value yields "".
func anthropicToolResultText(resultContent interface{}) string {
	switch rc := resultContent.(type) {
	case string:
		return rc
	case []interface{}:
		var text string
		for _, cb := range rc {
			cbMap, ok := cb.(map[string]interface{})
			if !ok || cbMap["type"] != "text" {
				continue
			}
			if t, ok := cbMap["text"].(string); ok {
				text += t
			}
		}
		return text
	}
	return ""
}
// convertAnthropicTools converts Anthropic tools to internal Functions format
// and applies the request's tool_choice to cfg.
//
// tool_choice is accepted both as a bare string ("auto", "any", "none") and as
// an object ({"type": "auto"|"any"|"none"} or {"type": "tool", "name": ...}):
//   - "any" forces a tool call via cfg.SetFunctionCallString("required").
//   - "none" disables tools: (nil, false) is returned.
//   - "tool" with a string name forces that tool via cfg.SetFunctionCallString.
//   - anything else (including "auto" or no tool_choice) leaves cfg untouched.
//
// It returns the converted functions and whether function calling should be
// used, which also requires cfg.ShouldUseFunctions() to be true. When the
// request declares no tools it returns (nil, false).
func convertAnthropicTools(input *schema.AnthropicRequest, cfg *config.ModelConfig) (functions.Functions, bool) {
	if len(input.Tools) == 0 {
		return nil, false
	}

	var funcs functions.Functions
	for _, tool := range input.Tools {
		funcs = append(funcs, functions.Function{
			Name:        tool.Name,
			Description: tool.Description,
			Parameters:  tool.InputSchema,
		})
	}

	// Normalise tool_choice to a (type, name) pair so the string and object
	// forms are handled by the same switch below.
	var (
		choiceType string
		toolName   string
		hasName    bool
	)
	switch tc := input.ToolChoice.(type) {
	case string:
		choiceType = tc
	case map[string]interface{}:
		choiceType, _ = tc["type"].(string)
		toolName, hasName = tc["name"].(string)
	}

	switch choiceType {
	case "any":
		// Force the model to use one of the tools
		cfg.SetFunctionCallString("required")
	case "none":
		// Don't use tools
		return nil, false
	case "tool":
		// Force a specific tool; a missing name leaves the choice to the model
		if hasName {
			cfg.SetFunctionCallString(toolName)
		}
	}
	// "auto" is the default - let model decide

	return funcs, len(funcs) > 0 && cfg.ShouldUseFunctions()
}

View File

@@ -3,7 +3,6 @@ package openai
import (
"encoding/json"
"fmt"
"strings"
"time"
"github.com/google/uuid"
@@ -35,54 +34,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0, FinishReason: nil}},
Object: "chat.completion.chunk",
}
responses <- initialMessage
// Track accumulated content for reasoning extraction
accumulatedContent := ""
lastEmittedReasoning := ""
lastEmittedCleanedContent := ""
_, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
accumulatedContent += s
// Extract reasoning from accumulated content
currentReasoning, cleanedContent := functions.ExtractReasoning(accumulatedContent)
// Calculate new reasoning delta (what we haven't emitted yet)
var reasoningDelta *string
if currentReasoning != lastEmittedReasoning {
// Extract only the new part
if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
newReasoning := currentReasoning[len(lastEmittedReasoning):]
reasoningDelta = &newReasoning
lastEmittedReasoning = currentReasoning
} else if currentReasoning != "" {
// If reasoning changed in a non-append way, emit the full current reasoning
reasoningDelta = &currentReasoning
lastEmittedReasoning = currentReasoning
}
}
// Calculate content delta from cleaned content
var deltaContent string
if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
lastEmittedCleanedContent = cleanedContent
} else if cleanedContent != lastEmittedCleanedContent {
// If cleaned content changed but not in a simple append, extract delta from cleaned content
// This handles cases where thinking tags are removed mid-stream
if lastEmittedCleanedContent == "" {
deltaContent = cleanedContent
lastEmittedCleanedContent = cleanedContent
} else {
// Content changed in non-append way, use the new cleaned content
deltaContent = cleanedContent
lastEmittedCleanedContent = cleanedContent
}
}
// Only emit content if there's actual content (not just thinking tags)
// If deltaContent is empty, we still emit the response but with empty content
usage := schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
CompletionTokens: tokenUsage.Completion,
@@ -93,20 +49,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
}
delta := &schema.Message{}
// Only include content if there's actual content (not just thinking tags)
if deltaContent != "" {
delta.Content = &deltaContent
}
if reasoningDelta != nil && *reasoningDelta != "" {
delta.Reasoning = reasoningDelta
}
resp := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0, FinishReason: nil}},
Object: "chat.completion.chunk",
Usage: usage,
}
@@ -229,10 +176,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
if err != nil {
return err
}
// Extract reasoning before processing tool calls
reasoning, cleanedResult := functions.ExtractReasoning(result)
result = cleanedResult
textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
result = functions.CleanupLLMResult(result, config.FunctionsConfig)
functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig)
@@ -265,20 +208,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
}
var deltaReasoning *string
if reasoning != "" {
deltaReasoning = &reasoning
}
delta := &schema.Message{Content: &result}
if deltaReasoning != nil {
delta.Reasoning = deltaReasoning
}
resp := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0, FinishReason: nil}},
Object: "chat.completion.chunk",
Usage: usage,
}
@@ -619,18 +553,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
default:
tokenCallback := func(s string, c *[]schema.Choice) {
// Extract reasoning from the response
reasoning, cleanedS := functions.ExtractReasoning(s)
s = cleanedS
if !shouldUseFn {
// no function is called, just reply and use stop as finish reason
stopReason := FinishReasonStop
message := &schema.Message{Role: "assistant", Content: &s}
if reasoning != "" {
message.Reasoning = &reasoning
}
*c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: message})
*c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
return
}
@@ -649,13 +575,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
}
stopReason := FinishReasonStop
message := &schema.Message{Role: "assistant", Content: &result}
if reasoning != "" {
message.Reasoning = &reasoning
}
*c = append(*c, schema.Choice{
FinishReason: &stopReason,
Message: message})
Message: &schema.Message{Role: "assistant", Content: &result}})
default:
toolCallsReason := FinishReasonToolCalls
toolChoice := schema.Choice{
@@ -664,9 +586,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Role: "assistant",
},
}
if reasoning != "" {
toolChoice.Message.Reasoning = &reasoning
}
for _, ss := range results {
name, args := ss.Name, ss.Arguments
@@ -687,20 +606,16 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
} else {
// otherwise we return more choices directly (deprecated)
functionCallReason := FinishReasonFunctionCall
message := &schema.Message{
Role: "assistant",
Content: &textContentToReturn,
FunctionCall: map[string]interface{}{
"name": name,
"arguments": args,
},
}
if reasoning != "" {
message.Reasoning = &reasoning
}
*c = append(*c, schema.Choice{
FinishReason: &functionCallReason,
Message: message,
Message: &schema.Message{
Role: "assistant",
Content: &textContentToReturn,
FunctionCall: map[string]interface{}{
"name": name,
"arguments": args,
},
},
})
}
}

View File

@@ -1,108 +0,0 @@
package routes
import (
"context"
"fmt"
"net/http"
"github.com/google/uuid"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/application"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/endpoints/anthropic"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/xlog"
)
// RegisterAnthropicRoutes registers the Anthropic Messages API handler on app
// under both "/v1/messages" and the unversioned "/messages" path, each behind
// the same middleware chain: tracing, default-model selection for chat models,
// request/config extraction, and Anthropic request-context setup.
func RegisterAnthropicRoutes(app *echo.Echo,
	re *middleware.RequestExtractor,
	application *application.Application) {

	handler := anthropic.MessagesEndpoint(
		application.ModelConfigLoader(),
		application.ModelLoader(),
		application.TemplatesEvaluator(),
		application.ApplicationConfig(),
	)

	newRequest := func() schema.LocalAIRequest { return new(schema.AnthropicRequest) }
	chain := []echo.MiddlewareFunc{
		middleware.TraceMiddleware(application),
		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
		re.SetModelAndConfig(newRequest),
		setAnthropicRequestContext(application.ApplicationConfig()),
	}

	// The versioned path is the main endpoint; the bare path is kept for
	// compatibility.
	for _, path := range []string{"/v1/messages", "/messages"} {
		app.POST(path, handler, chain...)
	}
}
// setAnthropicRequestContext returns a middleware that validates an Anthropic
// request and attaches a cancellable context to it.
//
// It expects the parsed *schema.AnthropicRequest and *config.ModelConfig to be
// present on the echo context, and answers 400 when either is missing, when the
// model is empty, or when max_tokens is not greater than 0. The correlation ID
// is taken from the "x-request-id" request header (or generated) and echoed in
// the response header.
//
// For valid requests, input.Context is derived from appConfig.Context, carries
// the correlation ID, and is cancelled when the HTTP request context is done;
// input.Cancel holds the matching cancel function. An empty cfg.Model is
// replaced with input.Model.
func setAnthropicRequestContext(appConfig *config.ApplicationConfig) echo.MiddlewareFunc {
	return func(next echo.HandlerFunc) echo.HandlerFunc {
		return func(c echo.Context) error {
			input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.AnthropicRequest)
			if !ok || input.Model == "" {
				return echo.NewHTTPError(http.StatusBadRequest, "model is required")
			}

			cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
			if !ok || cfg == nil {
				return echo.NewHTTPError(http.StatusBadRequest, "model configuration not found")
			}

			// Extract or generate the correlation ID
			// Anthropic uses x-request-id header
			correlationID := c.Request().Header.Get("x-request-id")
			if correlationID == "" {
				correlationID = uuid.New().String()
			}
			c.Response().Header().Set("x-request-id", correlationID)

			// Log the Anthropic API version if provided
			if anthropicVersion := c.Request().Header.Get("anthropic-version"); anthropicVersion != "" {
				xlog.Debug("Anthropic API version", "version", anthropicVersion)
			}

			// Validate max_tokens before creating the cancellable context and
			// its watcher goroutine, so rejected requests allocate neither.
			if input.MaxTokens <= 0 {
				return echo.NewHTTPError(http.StatusBadRequest, fmt.Sprintf("max_tokens is required and must be greater than 0"))
			}

			// Set up context with cancellation
			reqCtx := c.Request().Context()
			c1, cancel := context.WithCancel(appConfig.Context)

			// Cancel when request context is cancelled (client disconnects).
			// The goroutine exits as soon as either context is done.
			go func() {
				select {
				case <-reqCtx.Done():
					cancel()
				case <-c1.Done():
					// Already cancelled
				}
			}()

			// Add the correlation ID to the new context
			ctxWithCorrelationID := context.WithValue(c1, middleware.CorrelationIDKey, correlationID)

			input.Context = ctxWithCorrelationID
			input.Cancel = cancel

			if cfg.Model == "" {
				xlog.Debug("replacing empty cfg.Model with input value", "input.Model", input.Model)
				cfg.Model = input.Model
			}

			c.Set(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST, input)
			c.Set(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG, cfg)

			return next(c)
		}
	}
}

View File

@@ -617,12 +617,6 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
installedBackendsCount = len(installedBackends)
}
// Get the detected system capability
detectedCapability := ""
if appConfig.SystemState != nil {
detectedCapability = appConfig.SystemState.DetectedCapability()
}
return c.JSON(200, map[string]interface{}{
"backends": backendsJSON,
"repositories": appConfig.BackendGalleries,
@@ -635,7 +629,6 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
"totalPages": totalPages,
"prevPage": prevPage,
"nextPage": nextPage,
"systemCapability": detectedCapability,
})
})

View File

@@ -1368,7 +1368,6 @@ async function promptGPT(systemPrompt, input) {
let lastAssistantMessageIndex = -1;
let lastThinkingMessageIndex = -1;
let lastThinkingScrollTime = 0;
let hasReasoningFromAPI = false; // Track if we're receiving reasoning from API (skip tag-based detection)
const THINKING_SCROLL_THROTTLE = 200; // Throttle scrolling to every 200ms
try {
@@ -1402,24 +1401,19 @@ async function promptGPT(systemPrompt, input) {
// Handle different event types
switch (eventData.type) {
case "reasoning":
hasReasoningFromAPI = true; // Mark that we're receiving reasoning from API
if (eventData.content) {
const currentChat = chatStore.getChat(chatId);
if (!currentChat) break; // Chat was deleted
const isMCPMode = currentChat.mcpMode || false;
const shouldExpand = !isMCPMode; // Expanded in non-MCP mode, collapsed in MCP mode
// Insert thinking before assistant message if it exists (always use "thinking" role)
// Insert reasoning before assistant message if it exists
if (lastAssistantMessageIndex >= 0 && targetHistory[lastAssistantMessageIndex]?.role === "assistant") {
targetHistory.splice(lastAssistantMessageIndex, 0, {
role: "thinking",
role: "reasoning",
content: eventData.content,
html: DOMPurify.sanitize(marked.parse(eventData.content)),
image: [],
audio: [],
expanded: shouldExpand
expanded: false // Reasoning is always collapsed
});
lastAssistantMessageIndex++; // Adjust index since we inserted
// Scroll smoothly after adding thinking
// Scroll smoothly after adding reasoning
setTimeout(() => {
const chatContainer = document.getElementById('chat');
if (chatContainer) {
@@ -1431,7 +1425,7 @@ async function promptGPT(systemPrompt, input) {
}, 100);
} else {
// No assistant message yet, just add normally
chatStore.add("thinking", eventData.content, null, null, chatId);
chatStore.add("reasoning", eventData.content, null, null, chatId);
}
}
break;
@@ -1497,17 +1491,14 @@ async function promptGPT(systemPrompt, input) {
// Only update display if this is the active chat (interval will handle it)
// Don't call updateTokensPerSecond here to avoid unnecessary updates
// Only check for thinking tags if we're NOT receiving reasoning from API
// This prevents duplicate thinking/reasoning messages
if (!hasReasoningFromAPI) {
// Check for thinking tags in the chunk (incremental detection)
if (contentChunk.includes("<thinking>") || contentChunk.includes("<think>")) {
isThinking = true;
thinkingContent = "";
lastThinkingMessageIndex = -1;
}
if (contentChunk.includes("</thinking>") || contentChunk.includes("</think>")) {
// Check for thinking tags in the chunk (incremental detection)
if (contentChunk.includes("<thinking>") || contentChunk.includes("<think>")) {
isThinking = true;
thinkingContent = "";
lastThinkingMessageIndex = -1;
}
if (contentChunk.includes("</thinking>") || contentChunk.includes("</think>")) {
isThinking = false;
// When closing tag is detected, process the accumulated thinking content
if (thinkingContent.trim()) {
@@ -1561,11 +1552,10 @@ async function promptGPT(systemPrompt, input) {
}
thinkingContent = "";
}
}
}
// Handle content based on thinking state (only if not receiving reasoning from API)
if (!hasReasoningFromAPI && isThinking) {
// Handle content based on thinking state
if (isThinking) {
thinkingContent += contentChunk;
const currentChat = chatStore.getChat(chatId);
if (!currentChat) break; // Chat was deleted
@@ -1647,10 +1637,7 @@ async function promptGPT(systemPrompt, input) {
// Process any thinking tags that might be in the accumulated content
// This handles cases where tags are split across chunks
// Only process if we're NOT receiving reasoning from API (to avoid duplicates)
const { regularContent: processedRegular, thinkingContent: processedThinking } = hasReasoningFromAPI
? { regularContent: regularContent, thinkingContent: "" }
: processThinkingTags(regularContent);
const { regularContent: processedRegular, thinkingContent: processedThinking } = processThinkingTags(regularContent);
// Update or create assistant message with processed regular content
const currentChat = chatStore.getChat(chatId);
@@ -1658,10 +1645,10 @@ async function promptGPT(systemPrompt, input) {
const request = activeRequests.get(chatId);
const requestModel = request?.model || null;
if (lastAssistantMessageIndex === -1) {
// Create assistant message if we have any content (even if empty string after processing)
// This ensures the message is created and can be updated with more content later
chatStore.add("assistant", processedRegular || "", null, null, chatId, requestModel);
lastAssistantMessageIndex = targetHistory.length - 1;
if (processedRegular && processedRegular.trim()) {
chatStore.add("assistant", processedRegular, null, null, chatId, requestModel);
lastAssistantMessageIndex = targetHistory.length - 1;
}
} else {
const lastMessage = targetHistory[lastAssistantMessageIndex];
if (lastMessage && lastMessage.role === "assistant") {
@@ -1699,10 +1686,7 @@ async function promptGPT(systemPrompt, input) {
if (assistantContentBuffer.length > 0) {
const regularContent = assistantContentBuffer.join("");
// Process any remaining thinking tags that might be in the buffer
// Only process if we're NOT receiving reasoning from API (to avoid duplicates)
const { regularContent: processedRegular, thinkingContent: processedThinking } = hasReasoningFromAPI
? { regularContent: regularContent, thinkingContent: "" }
: processThinkingTags(regularContent);
const { regularContent: processedRegular, thinkingContent: processedThinking } = processThinkingTags(regularContent);
const currentChat = chatStore.getChat(chatId);
if (!currentChat) {
@@ -1735,26 +1719,23 @@ async function promptGPT(systemPrompt, input) {
}
// Then update or create assistant message
// Always create/update assistant message if we have any content
if (lastAssistantMessageIndex !== -1) {
const lastMessage = targetHistory[lastAssistantMessageIndex];
if (lastMessage && lastMessage.role === "assistant") {
lastMessage.content = (lastMessage.content || "") + (processedRegular || "");
lastMessage.html = DOMPurify.sanitize(marked.parse(lastMessage.content));
}
} else {
// Create assistant message (even if empty, so it can be updated with more content)
} else if (processedRegular && processedRegular.trim()) {
const request = activeRequests.get(chatId);
const requestModel = request?.model || null;
chatStore.add("assistant", processedRegular || "", null, null, chatId, requestModel);
chatStore.add("assistant", processedRegular, null, null, chatId, requestModel);
lastAssistantMessageIndex = targetHistory.length - 1;
}
}
// Final thinking content flush if any data remains (from incremental detection)
// Only process if we're NOT receiving reasoning from API (to avoid duplicates)
const finalChat = chatStore.getChat(chatId);
if (finalChat && !hasReasoningFromAPI && thinkingContent.trim() && lastThinkingMessageIndex === -1) {
if (finalChat && thinkingContent.trim() && lastThinkingMessageIndex === -1) {
const finalHistory = finalChat.history;
// Extract thinking content if tags are present
const thinkingMatch = thinkingContent.match(/<(?:thinking|redacted_reasoning)>(.*?)<\/(?:thinking|redacted_reasoning)>/s);
@@ -1910,13 +1891,9 @@ async function promptGPT(systemPrompt, input) {
let buffer = "";
let contentBuffer = [];
let thinkingContent = "";
let reasoningContent = ""; // Track reasoning from API reasoning field
let isThinking = false;
let lastThinkingMessageIndex = -1;
let lastReasoningMessageIndex = -1; // Track reasoning message separately
let lastAssistantMessageIndex = -1; // Track assistant message for reasoning placement
let lastThinkingScrollTime = 0;
let hasReasoningFromAPI = false; // Track if we're receiving reasoning from API (skip tag-based detection)
const THINKING_SCROLL_THROTTLE = 200; // Throttle scrolling to every 200ms
try {
@@ -1952,100 +1929,30 @@ async function promptGPT(systemPrompt, input) {
chatStore.updateTokenUsage(jsonData.usage, chatId);
}
const token = jsonData.choices?.[0]?.delta?.content;
const reasoningDelta = jsonData.choices?.[0]?.delta?.reasoning;
const token = jsonData.choices[0].delta.content;
// Handle reasoning from API reasoning field - always use "thinking" role
if (reasoningDelta && reasoningDelta.trim() !== "") {
hasReasoningFromAPI = true; // Mark that we're receiving reasoning from API
reasoningContent += reasoningDelta;
const currentChat = chatStore.getChat(chatId);
if (!currentChat) {
// Chat was deleted, skip this line
if (token) {
// Check for thinking tags
if (token.includes("<thinking>") || token.includes("<think>")) {
isThinking = true;
thinkingContent = "";
lastThinkingMessageIndex = -1;
return;
}
const isMCPMode = currentChat.mcpMode || false;
const shouldExpand = !isMCPMode; // Expanded in non-MCP mode, collapsed in MCP mode
// Only create/update thinking message if we have actual content
if (reasoningContent.trim() !== "") {
// Update or create thinking message (always use "thinking" role, not "reasoning")
if (lastReasoningMessageIndex === -1) {
// Find the last assistant message index to insert thinking before it
const targetHistory = currentChat.history;
const assistantIndex = targetHistory.length - 1;
if (assistantIndex >= 0 && targetHistory[assistantIndex]?.role === "assistant") {
// Insert thinking before assistant message
targetHistory.splice(assistantIndex, 0, {
role: "thinking",
content: reasoningContent,
html: DOMPurify.sanitize(marked.parse(reasoningContent)),
image: [],
audio: [],
expanded: shouldExpand
});
lastReasoningMessageIndex = assistantIndex;
lastAssistantMessageIndex = assistantIndex + 1; // Adjust for inserted thinking
} else {
// No assistant message yet, just add normally
chatStore.add("thinking", reasoningContent, null, null, chatId);
lastReasoningMessageIndex = currentChat.history.length - 1;
}
} else {
// Update existing thinking message
const targetHistory = currentChat.history;
if (lastReasoningMessageIndex >= 0 && lastReasoningMessageIndex < targetHistory.length) {
const thinkingMessage = targetHistory[lastReasoningMessageIndex];
if (thinkingMessage && thinkingMessage.role === "thinking") {
thinkingMessage.content = reasoningContent;
thinkingMessage.html = DOMPurify.sanitize(marked.parse(reasoningContent));
}
if (token.includes("</thinking>") || token.includes("</think>")) {
isThinking = false;
if (thinkingContent.trim()) {
// Only add the final thinking message if we don't already have one
if (lastThinkingMessageIndex === -1) {
chatStore.add("thinking", thinkingContent, null, null, chatId);
}
}
return;
}
// Scroll when reasoning is updated (throttled)
const now = Date.now();
if (now - lastThinkingScrollTime > THINKING_SCROLL_THROTTLE) {
lastThinkingScrollTime = now;
setTimeout(() => {
const chatContainer = document.getElementById('chat');
if (chatContainer) {
chatContainer.scrollTo({
top: chatContainer.scrollHeight,
behavior: 'smooth'
});
}
scrollThinkingBoxToBottom();
}, 100);
}
}
if (token && token.trim() !== "") {
// Only check for thinking tags if we're NOT receiving reasoning from API
// This prevents duplicate thinking/reasoning messages
if (!hasReasoningFromAPI) {
// Check for thinking tags (legacy support - models that output tags directly)
if (token.includes("<thinking>") || token.includes("<think>")) {
isThinking = true;
thinkingContent = "";
lastThinkingMessageIndex = -1;
return;
}
if (token.includes("</thinking>") || token.includes("</think>")) {
isThinking = false;
if (thinkingContent.trim()) {
// Only add the final thinking message if we don't already have one
if (lastThinkingMessageIndex === -1) {
chatStore.add("thinking", thinkingContent, null, null, chatId);
}
}
return;
}
// Handle content based on thinking state
if (isThinking) {
thinkingContent += token;
// Handle content based on thinking state
if (isThinking) {
thinkingContent += token;
// Count tokens for rate calculation (per chat)
const request = activeRequests.get(chatId);
if (request) {
@@ -2088,42 +1995,7 @@ async function promptGPT(systemPrompt, input) {
}, 100);
}
} else {
// Not in thinking state, add to content buffer
contentBuffer.push(token);
// Track assistant message index for reasoning placement
if (lastAssistantMessageIndex === -1) {
const currentChat = chatStore.getChat(chatId);
if (currentChat) {
const targetHistory = currentChat.history;
// Find or create assistant message index
for (let i = targetHistory.length - 1; i >= 0; i--) {
if (targetHistory[i].role === "assistant") {
lastAssistantMessageIndex = i;
break;
}
}
// If no assistant message yet, it will be created when we flush contentBuffer
}
}
}
} else {
// Receiving reasoning from API, just add token to content buffer
contentBuffer.push(token);
// Track assistant message index for reasoning placement
if (lastAssistantMessageIndex === -1) {
const currentChat = chatStore.getChat(chatId);
if (currentChat) {
const targetHistory = currentChat.history;
// Find or create assistant message index
for (let i = targetHistory.length - 1; i >= 0; i--) {
if (targetHistory[i].role === "assistant") {
lastAssistantMessageIndex = i;
break;
}
}
// If no assistant message yet, it will be created when we flush contentBuffer
}
}
}
}
} catch (error) {
@@ -2135,17 +2007,6 @@ async function promptGPT(systemPrompt, input) {
// Efficiently update the chat in batch
if (contentBuffer.length > 0) {
addToChat(contentBuffer.join(""));
// Update assistant message index after adding content
const currentChat = chatStore.getChat(chatId);
if (currentChat) {
const targetHistory = currentChat.history;
for (let i = targetHistory.length - 1; i >= 0; i--) {
if (targetHistory[i].role === "assistant") {
lastAssistantMessageIndex = i;
break;
}
}
}
contentBuffer = [];
// Scroll when assistant content is updated (this will also show thinking messages above)
setTimeout(() => {
@@ -2164,30 +2025,7 @@ async function promptGPT(systemPrompt, input) {
if (contentBuffer.length > 0) {
addToChat(contentBuffer.join(""));
}
// Final reasoning flush if any data remains - always use "thinking" role
const finalChat = chatStore.getChat(chatId);
if (finalChat && reasoningContent.trim() && lastReasoningMessageIndex === -1) {
const isMCPMode = finalChat.mcpMode || false;
const shouldExpand = !isMCPMode;
const targetHistory = finalChat.history;
// Find assistant message to insert before
const assistantIndex = targetHistory.length - 1;
if (assistantIndex >= 0 && targetHistory[assistantIndex]?.role === "assistant") {
targetHistory.splice(assistantIndex, 0, {
role: "thinking",
content: reasoningContent,
html: DOMPurify.sanitize(marked.parse(reasoningContent)),
image: [],
audio: [],
expanded: shouldExpand
});
} else {
chatStore.add("thinking", reasoningContent, null, null, chatId);
}
}
// Final thinking content flush (legacy tag-based thinking)
if (finalChat && thinkingContent.trim() && lastThinkingMessageIndex === -1) {
chatStore.add("thinking", thinkingContent, null, null, chatId);
}

View File

@@ -54,11 +54,6 @@
<span class="font-semibold text-cyan-300" x-text="installedBackends"></span>
<span class="text-[#94A3B8] ml-1">installed</span>
</a>
<div class="flex items-center bg-[#101827] rounded-lg px-4 py-2 border border-[#38BDF8]/30">
<i class="fas fa-microchip text-[#38BDF8] mr-2"></i>
<span class="text-[#94A3B8] mr-1">Capability:</span>
<span class="font-semibold text-[#38BDF8]" x-text="systemCapability"></span>
</div>
<a href="https://localai.io/backends/" target="_blank" class="btn-primary">
<i class="fas fa-info-circle mr-2"></i>
<span>Documentation</span>
@@ -593,7 +588,6 @@ function backendsGallery() {
totalPages: 1,
availableBackends: 0,
installedBackends: 0,
systemCapability: '',
selectedBackend: null,
jobProgress: {},
notifications: [],
@@ -689,7 +683,6 @@ function backendsGallery() {
this.totalPages = data.totalPages || 1;
this.availableBackends = data.availableBackends || 0;
this.installedBackends = data.installedBackends || 0;
this.systemCapability = data.systemCapability || 'default';
} catch (error) {
console.error('Error fetching backends:', error);
} finally {

View File

@@ -41,7 +41,7 @@ SOFTWARE.
__chatContextSize = {{ .ContextSize }};
{{ end }}
// Store gallery configs for header icon display and model info modal
// Store gallery configs for header icon display
window.__galleryConfigs = {};
{{ $allGalleryConfigs:=.GalleryConfig }}
{{ range $modelName, $galleryConfig := $allGalleryConfigs }}
@@ -49,16 +49,6 @@ SOFTWARE.
{{ if $galleryConfig.Icon }}
window.__galleryConfigs["{{$modelName}}"].Icon = "{{$galleryConfig.Icon}}";
{{ end }}
{{ if $galleryConfig.Description }}
window.__galleryConfigs["{{$modelName}}"].Description = {{ printf "%q" $galleryConfig.Description }};
{{ end }}
{{ if $galleryConfig.URLs }}
window.__galleryConfigs["{{$modelName}}"].URLs = [
{{ range $idx, $url := $galleryConfig.URLs }}
{{ if $idx }},{{ end }}{{ printf "%q" $url }}
{{ end }}
];
{{ end }}
{{ end }}
// Function to initialize store
@@ -336,10 +326,10 @@ SOFTWARE.
c += DOMPurify.sanitize(marked.parse(line));
});
}
// Set expanded state: thinking and reasoning are expanded by default in non-MCP mode, collapsed in MCP mode
// tool_call and tool_result are always collapsed by default
// Set expanded state: thinking is expanded by default in non-MCP mode, collapsed in MCP mode
// Reasoning, tool_call, and tool_result are always collapsed by default
const isMCPMode = chat.mcpMode || false;
const shouldExpand = ((role === "thinking" || role === "reasoning") && !isMCPMode) || false;
const shouldExpand = (role === "thinking" && !isMCPMode) || false;
chat.history.push({ role, content, html: c, image, audio, expanded: shouldExpand, model: messageModel });
// Auto-name chat from first user message
@@ -507,11 +497,6 @@ SOFTWARE.
activeChat.model = modelName;
activeChat.updatedAt = Date.now();
// Update model info modal with new model
if (window.updateModelInfoModal) {
window.updateModelInfoModal(modelName);
}
// Get context size from data attribute
let contextSize = null;
if (selectedOption.dataset.contextSize) {
@@ -551,23 +536,18 @@ SOFTWARE.
}
// Update model selector to reflect the change (ensure it stays in sync)
// Note: We don't dispatch a change event here to avoid infinite loop
// The selector is already updated via user interaction or programmatic change
const modelSelector = document.getElementById('modelSelector');
if (modelSelector) {
// Find and select the option matching the model
const optionValue = 'chat/' + modelName;
for (let i = 0; i < modelSelector.options.length; i++) {
if (modelSelector.options[i].value === optionValue) {
// Only update if it's different to avoid unnecessary updates
if (modelSelector.selectedIndex !== i) {
modelSelector.selectedIndex = i;
}
modelSelector.selectedIndex = i;
break;
}
}
// Don't dispatch change event here - it would cause infinite recursion
// The selector is already in sync with the model
// Trigger Alpine reactivity by dispatching change event
modelSelector.dispatchEvent(new Event('change', { bubbles: true }));
}
// Trigger MCP availability check in Alpine component
@@ -623,52 +603,27 @@ SOFTWARE.
<div class="flex items-center justify-between gap-2">
<label class="text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide flex-shrink-0">Model</label>
<div class="flex items-center gap-1 flex-shrink-0">
<!-- Info button - reactive to active chat model -->
<template x-if="$store.chat.activeChat() && $store.chat.activeChat().model && window.__galleryConfigs && window.__galleryConfigs[$store.chat.activeChat().model]">
<button
data-twe-ripple-init
data-twe-ripple-color="light"
class="text-[var(--color-text-secondary)] hover:text-[var(--color-primary)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]"
data-modal-target="model-info-modal"
data-modal-toggle="model-info-modal"
:data-model-name="$store.chat.activeChat().model"
@click="if (window.updateModelInfoModal) { window.updateModelInfoModal($store.chat.activeChat().model, true); }"
title="Model Information">
<i class="fas fa-info-circle"></i>
</button>
</template>
<!-- Fallback info button for initial model from server -->
<template x-if="(!$store.chat.activeChat() || !$store.chat.activeChat().model) && window.__galleryConfigs && window.__galleryConfigs['{{$model}}']">
<button
data-twe-ripple-init
data-twe-ripple-color="light"
class="text-[var(--color-text-secondary)] hover:text-[var(--color-primary)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]"
data-modal-target="model-info-modal"
data-modal-toggle="model-info-modal"
data-model-name="{{$model}}"
@click="if (window.updateModelInfoModal) { window.updateModelInfoModal('{{$model}}', true); }"
title="Model Information">
<i class="fas fa-info-circle"></i>
</button>
</template>
<!-- Edit button - reactive to active chat model -->
<template x-if="$store.chat.activeChat() && $store.chat.activeChat().model">
<a :href="'/models/edit/' + $store.chat.activeChat().model"
class="text-[var(--color-text-secondary)] hover:text-[var(--color-warning)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]"
title="Edit Model Configuration">
<i class="fas fa-edit"></i>
</a>
</template>
<!-- Fallback edit button for initial model from server -->
<template x-if="!$store.chat.activeChat() || !$store.chat.activeChat().model">
{{ if $model }}
<a href="/models/edit/{{$model}}"
class="text-[var(--color-text-secondary)] hover:text-[var(--color-warning)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]"
title="Edit Model Configuration">
<i class="fas fa-edit"></i>
</a>
{{ end }}
</template>
{{ if $model }}
{{ $galleryConfig:= index $allGalleryConfigs $model}}
{{ if $galleryConfig }}
<button
data-twe-ripple-init
data-twe-ripple-color="light"
class="text-[var(--color-text-secondary)] hover:text-[var(--color-primary)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]"
data-modal-target="model-info-modal"
data-modal-toggle="model-info-modal"
title="Model Information">
<i class="fas fa-info-circle"></i>
</button>
{{ end }}
{{ end }}
{{ if $model }}
<a href="/models/edit/{{$model}}"
class="text-[var(--color-text-secondary)] hover:text-[var(--color-warning)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]"
title="Edit Model Configuration">
<i class="fas fa-edit"></i>
</a>
{{ end }}
</div>
</div>
<select
@@ -1533,14 +1488,17 @@ SOFTWARE.
</div>
</div>
<!-- Modal moved outside of sidebar to appear in center of page - Always available, content updated dynamically -->
<div id="model-info-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 flex justify-center items-center w-full h-full md:inset-0 max-h-full" style="padding: 1rem;">
<!-- Modal moved outside of sidebar to appear in center of page -->
{{ if $model }}
{{ $galleryConfig:= index $allGalleryConfigs $model}}
{{ if $galleryConfig }}
<div id="model-info-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 flex justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full">
<div class="relative p-4 w-full max-w-2xl max-h-full">
<div class="relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700">
<!-- Header -->
<div class="flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600">
<h3 id="model-info-modal-title" class="text-xl font-semibold text-gray-900 dark:text-white">{{ if $model }}{{ $model }}{{ end }}</h3>
<button class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="model-info-modal" @click="if (window.closeModelInfoModal) { window.closeModelInfoModal(); }">
<h3 class="text-xl font-semibold text-gray-900 dark:text-white">{{ $model }}</h3>
<button class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="model-info-modal">
<svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
<path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
</svg>
@@ -1551,24 +1509,29 @@ SOFTWARE.
<!-- Body -->
<div class="p-4 md:p-5 space-y-4">
<div class="flex justify-center items-center">
<img id="model-info-modal-icon" class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded" style="display: none;" loading="lazy"/>
{{ if $galleryConfig.Icon }}<img class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded" src="{{$galleryConfig.Icon}}" loading="lazy"/>{{end}}
</div>
<div id="model-info-description" class="text-base leading-relaxed text-gray-500 dark:text-gray-400 break-words max-w-full"></div>
<div id="model-info-description" class="text-base leading-relaxed text-gray-500 dark:text-gray-400 break-words max-w-full">{{ $galleryConfig.Description }}</div>
<hr>
<p class="text-sm font-semibold text-gray-900 dark:text-white">Links</p>
<ul id="model-info-links">
<ul>
{{range $galleryConfig.URLs}}
<li><a href="{{ . }}" target="_blank">{{ . }}</a></li>
{{end}}
</ul>
</div>
<!-- Footer -->
<div class="flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600">
<button data-modal-hide="model-info-modal" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700" @click="if (window.closeModelInfoModal) { window.closeModelInfoModal(); }">
<button data-modal-hide="model-info-modal" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700">
Close
</button>
</div>
</div>
</div>
</div>
{{ end }}
{{ end }}
<!-- Alpine store initialization and utilities -->
<script>
@@ -1779,20 +1742,10 @@ SOFTWARE.
});
// Also listen for click events on modal toggle buttons
// Use event delegation to handle dynamically created buttons
document.addEventListener('click', (e) => {
const button = e.target.closest('[data-modal-toggle="model-info-modal"]');
if (button) {
// Update modal with current model before showing
if (window.Alpine && window.Alpine.store("chat")) {
const activeChat = window.Alpine.store("chat").activeChat();
const modelName = activeChat ? activeChat.model : (button.dataset.modelName || (document.getElementById("chat-model") ? document.getElementById("chat-model").value : null));
if (modelName && window.updateModelInfoModal) {
window.updateModelInfoModal(modelName, true);
}
}
document.querySelectorAll('[data-modal-toggle="model-info-modal"]').forEach(button => {
button.addEventListener('click', () => {
setTimeout(processMarkdown, 300);
}
});
});
// Process on initial load if libraries are ready
@@ -1833,176 +1786,12 @@ SOFTWARE.
syncModelSelectorOnLoad();
}
// Refresh the model info modal (title, icon, description, links) for modelName.
//
// Parameters:
//   modelName - key into window.__galleryConfigs; nothing happens when it is empty
//               or when window.__galleryConfigs is not defined.
//   openModal - when true the modal is also shown (with a click-to-close backdrop);
//               when false (default) only the content is updated.
//
// A model with no gallery entry (or an empty entry) still gets its name in the title,
// together with a "no additional information" message, a hidden icon and an empty
// link list.
window.updateModelInfoModal = function(modelName, openModal = false) {
    if (!modelName || !window.__galleryConfigs) {
        return;
    }

    const galleryConfig = window.__galleryConfigs[modelName];
    // An entry without any property is treated the same as a missing entry.
    const hasInfo = Boolean(galleryConfig) && Object.keys(galleryConfig).length > 0;

    // Title: always show which model was requested.
    const titleEl = document.getElementById('model-info-modal-title');
    if (titleEl) {
        titleEl.textContent = modelName;
    }

    // Icon: only visible when the gallery entry provides one.
    const iconEl = document.getElementById('model-info-modal-icon');
    if (iconEl) {
        if (hasInfo && galleryConfig.Icon) {
            iconEl.src = galleryConfig.Icon;
            iconEl.style.display = 'block';
        } else {
            iconEl.style.display = 'none';
        }
    }

    // Description: set through textContent so the text is never parsed as HTML here.
    const descEl = document.getElementById('model-info-description');
    if (descEl) {
        descEl.textContent = hasInfo
            ? (galleryConfig.Description || 'No description available.')
            : 'No additional information available for this model.';
    }

    // Links: rebuilt from scratch on every call.
    const linksEl = document.getElementById('model-info-links');
    if (linksEl) {
        const urls = hasInfo && Array.isArray(galleryConfig.URLs) ? galleryConfig.URLs : [];
        linksEl.innerHTML = '';
        urls.forEach(url => {
            const li = document.createElement('li');
            const a = document.createElement('a');
            a.href = url;
            a.target = '_blank';
            // The opened page must not get a handle on this window via window.opener.
            a.rel = 'noopener noreferrer';
            a.textContent = url;
            li.appendChild(a);
            linksEl.appendChild(li);
        });
        if (hasInfo && urls.length === 0) {
            linksEl.innerHTML = '<li>No links available</li>';
        }
    }

    // Only open the modal if explicitly requested
    if (!openModal) {
        return;
    }
    const modalElement = document.getElementById('model-info-modal');
    if (!modalElement) {
        return;
    }

    // Restore positioning/sizing classes in case they were removed; classList.add
    // ignores classes that are already present. Both the "has info" and "no info"
    // paths go through here so the modal is laid out the same way in either case.
    modalElement.classList.add('flex', 'justify-center', 'items-center', 'fixed', 'w-full', 'h-full');
    if (!modalElement.style.padding) {
        modalElement.style.padding = '1rem';
    }
    modalElement.classList.remove('hidden');
    modalElement.setAttribute('aria-hidden', 'false');

    // Add a single backdrop; clicking it closes the modal.
    if (!document.querySelector('.modal-backdrop')) {
        const backdrop = document.createElement('div');
        backdrop.className = 'modal-backdrop fixed inset-0 bg-gray-900 bg-opacity-50 dark:bg-opacity-80 z-40';
        document.body.appendChild(backdrop);
        backdrop.addEventListener('click', () => {
            window.closeModelInfoModal();
        });
    }
};
// Hide the model info modal and drop its backdrop, if either is present.
window.closeModelInfoModal = function() {
    const dialog = document.getElementById('model-info-modal');
    if (dialog) {
        dialog.classList.add('hidden');
        dialog.setAttribute('aria-hidden', 'true');
    }

    // Only the first matching backdrop is removed, as querySelector returns one element.
    document.querySelector('.modal-backdrop')?.remove();
};
// Also sync after Alpine initializes (in case it runs after DOMContentLoaded)
// Sync the model selector, then pre-fill the model info modal for the current
// model without opening it. The model comes from the active chat when there is
// one, otherwise from the value of the #chat-model element.
function initializeModelInfo() {
    syncModelSelectorOnLoad();

    const canUpdate = window.updateModelInfoModal && window.Alpine && window.Alpine.store("chat");
    if (!canUpdate) {
        return;
    }

    const activeChat = window.Alpine.store("chat").activeChat();
    let modelName;
    if (activeChat) {
        modelName = activeChat.model;
    } else {
        const chatModelEl = document.getElementById("chat-model");
        modelName = chatModelEl ? chatModelEl.value : null;
    }

    if (modelName) {
        // false = don't open, just update content
        window.updateModelInfoModal(modelName, false);
    }
}
if (window.Alpine) {
Alpine.nextTick(initializeModelInfo);
Alpine.nextTick(syncModelSelectorOnLoad);
} else {
document.addEventListener('alpine:init', () => {
Alpine.nextTick(initializeModelInfo);
Alpine.nextTick(syncModelSelectorOnLoad);
});
}
</script>

View File

@@ -1,176 +0,0 @@
package schema
import (
	"context"
	"encoding/json"
	"strings"
)
// AnthropicRequest represents a request to the Anthropic Messages API
// https://docs.anthropic.com/claude/reference/messages_post
//
// Model, Messages and MaxTokens are always encoded; every other API field is
// tagged omitempty. Temperature, TopK and TopP are pointers, so a nil pointer
// (field omitted) can be told apart from an explicit zero value.
type AnthropicRequest struct {
Model string `json:"model"`
Messages []AnthropicMessage `json:"messages"`
MaxTokens int `json:"max_tokens"`
Metadata map[string]string `json:"metadata,omitempty"`
StopSequences []string `json:"stop_sequences,omitempty"`
Stream bool `json:"stream,omitempty"`
System string `json:"system,omitempty"`
Temperature *float64 `json:"temperature,omitempty"`
TopK *int `json:"top_k,omitempty"`
TopP *float64 `json:"top_p,omitempty"`
Tools []AnthropicTool `json:"tools,omitempty"`
// ToolChoice is left untyped; its concrete shape is not constrained here.
ToolChoice interface{} `json:"tool_choice,omitempty"`
// Internal fields for request handling
// Both are tagged json:"-", so they are never read from or written to JSON.
Context context.Context `json:"-"`
Cancel context.CancelFunc `json:"-"`
}
// ModelName implements the LocalAIRequest interface.
//
// It acts as a combined getter/setter: a nil s leaves the request untouched,
// a non-nil s replaces the stored model first. The model name held by the
// request after the call is returned in both cases.
func (ar *AnthropicRequest) ModelName(s *string) string {
	if s == nil {
		return ar.Model
	}
	ar.Model = *s
	return ar.Model
}
// AnthropicTool represents a tool definition in the Anthropic format
//
// Name and InputSchema are always encoded; Description is omitted when empty.
// NOTE(review): InputSchema is presumably a JSON Schema object describing the
// tool arguments — nothing in this file validates it, confirm against callers.
type AnthropicTool struct {
Name string `json:"name"`
Description string `json:"description,omitempty"`
InputSchema map[string]interface{} `json:"input_schema"`
}
// AnthropicMessage represents a message in the Anthropic format
//
// Content is untyped because it can be either a string or an array of content
// blocks; use GetStringContent or GetContentBlocks to read it without
// type-switching at the call site.
type AnthropicMessage struct {
Role string `json:"role"`
Content interface{} `json:"content"`
}
// AnthropicContentBlock represents a content block in an Anthropic message
//
// A single struct covers every block type: Type is always encoded and every
// other field is tagged omitempty, so only the fields that are set appear in
// the JSON.
// NOTE(review): presumably Text belongs to "text" blocks, Source to image
// blocks, ID/Name/Input to tool-use blocks and ToolUseID/Content/IsError to
// tool-result blocks — this grouping is not enforced here, confirm with callers.
type AnthropicContentBlock struct {
Type string `json:"type"`
Text string `json:"text,omitempty"`
Source *AnthropicImageSource `json:"source,omitempty"`
ID string `json:"id,omitempty"`
Name string `json:"name,omitempty"`
Input map[string]interface{} `json:"input,omitempty"`
ToolUseID string `json:"tool_use_id,omitempty"`
Content interface{} `json:"content,omitempty"`
// IsError is a pointer so that an explicit false is still encoded.
IsError *bool `json:"is_error,omitempty"`
}
// AnthropicImageSource represents an image source in Anthropic format
//
// All three fields are always encoded (none is tagged omitempty).
// NOTE(review): Data presumably carries the encoded image payload described by
// Type and MediaType — the encoding is not visible in this file, confirm.
type AnthropicImageSource struct {
Type string `json:"type"`
MediaType string `json:"media_type"`
Data string `json:"data"`
}
// AnthropicResponse represents a response from the Anthropic Messages API
//
// StopReason has no omitempty tag, so a nil pointer is encoded as JSON null;
// StopSequence is omitted entirely when nil.
type AnthropicResponse struct {
ID string `json:"id"`
Type string `json:"type"`
Role string `json:"role"`
Content []AnthropicContentBlock `json:"content"`
Model string `json:"model"`
StopReason *string `json:"stop_reason"`
StopSequence *string `json:"stop_sequence,omitempty"`
Usage AnthropicUsage `json:"usage"`
}
// AnthropicUsage represents token usage in Anthropic format
//
// Both counters are always encoded, including when they are zero.
type AnthropicUsage struct {
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
}
// AnthropicStreamEvent represents a streaming event from the Anthropic API
//
// Type is always encoded; the pointer fields are optional and are omitted
// from the JSON when nil.
type AnthropicStreamEvent struct {
Type string `json:"type"`
// NOTE(review): with omitempty an Index of 0 is dropped from the encoded
// JSON, so an event for the first block carries no "index" key — confirm
// that consumers tolerate the missing key.
Index int `json:"index,omitempty"`
ContentBlock *AnthropicContentBlock `json:"content_block,omitempty"`
Delta *AnthropicStreamDelta `json:"delta,omitempty"`
Message *AnthropicStreamMessage `json:"message,omitempty"`
Usage *AnthropicUsage `json:"usage,omitempty"`
}
// AnthropicStreamDelta represents the delta in a streaming response
//
// Every field is tagged omitempty, so a delta only encodes the fields that
// are set.
// NOTE(review): presumably Text is used for text deltas, PartialJSON for
// incremental tool input and StopReason/StopSequence for message-level
// deltas — not enforced here, confirm with the code that emits these.
type AnthropicStreamDelta struct {
Type string `json:"type,omitempty"`
Text string `json:"text,omitempty"`
PartialJSON string `json:"partial_json,omitempty"`
StopReason *string `json:"stop_reason,omitempty"`
StopSequence *string `json:"stop_sequence,omitempty"`
}
// AnthropicStreamMessage represents the message object in streaming events
//
// It has the same fields and JSON tags as AnthropicResponse: StopReason is
// encoded as null when nil, StopSequence is omitted when nil.
type AnthropicStreamMessage struct {
ID string `json:"id"`
Type string `json:"type"`
Role string `json:"role"`
Content []AnthropicContentBlock `json:"content"`
Model string `json:"model"`
StopReason *string `json:"stop_reason"`
StopSequence *string `json:"stop_sequence,omitempty"`
Usage AnthropicUsage `json:"usage"`
}
// AnthropicErrorResponse represents an error response from the Anthropic API.
// It is the envelope around a single AnthropicError.
type AnthropicErrorResponse struct {
// Type is the envelope type; the schema tests use "error".
Type string `json:"type"`
// Error holds the error details and is always serialized.
Error AnthropicError `json:"error"`
}
// AnthropicError represents an error in the Anthropic format.
type AnthropicError struct {
// Type is the error category; the schema tests use "invalid_request_error".
Type string `json:"type"`
// Message is the human-readable description of the error.
Message string `json:"message"`
}
// GetStringContent flattens the message content into plain text.
//
// Content is either a plain string, which is returned as is, or a list of
// content blocks. For a list, the "text" value of every block whose "type" is
// "text" is appended in order with no separator; blocks of any other type,
// list entries that are not JSON objects, and non-string "text" values are
// ignored. Any other content type (including nil) yields "".
func (m *AnthropicMessage) GetStringContent() string {
	if plain, isString := m.Content.(string); isString {
		return plain
	}

	items, isList := m.Content.([]interface{})
	if !isList {
		return ""
	}

	var joined string
	for _, item := range items {
		fields, isObject := item.(map[string]interface{})
		if !isObject || fields["type"] != "text" {
			continue
		}
		if piece, isText := fields["text"].(string); isText {
			joined += piece
		}
	}
	return joined
}
// GetContentBlocks returns the message content as typed content blocks.
//
// A plain string becomes a single "text" block. A list is converted entry by
// entry: each JSON object is re-encoded and decoded into an
// AnthropicContentBlock. Entries that are not objects, or that fail to encode
// or decode, are skipped (best effort). The result is nil when the list
// yields no blocks or when the content has any other type.
func (m *AnthropicMessage) GetContentBlocks() []AnthropicContentBlock {
	if plain, isString := m.Content.(string); isString {
		return []AnthropicContentBlock{{Type: "text", Text: plain}}
	}

	items, isList := m.Content.([]interface{})
	if !isList {
		return nil
	}

	var parsed []AnthropicContentBlock
	for _, item := range items {
		fields, isObject := item.(map[string]interface{})
		if !isObject {
			continue
		}
		// Round-trip through JSON so the struct tags drive the field mapping.
		raw, err := json.Marshal(fields)
		if err != nil {
			continue
		}
		var decoded AnthropicContentBlock
		if json.Unmarshal(raw, &decoded) != nil {
			continue
		}
		parsed = append(parsed, decoded)
	}
	return parsed
}

View File

@@ -1,216 +0,0 @@
package schema_test
import (
"encoding/json"
"github.com/mudler/LocalAI/core/schema"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for the Anthropic request/response schema types: JSON decoding of
// requests, the AnthropicMessage content helpers, and JSON encoding of
// responses and errors.
var _ = Describe("Anthropic Schema", func() {
	Describe("AnthropicRequest", func() {
		It("should unmarshal a valid request", func() {
			jsonData := `{
				"model": "claude-3-sonnet-20240229",
				"max_tokens": 1024,
				"messages": [
					{"role": "user", "content": "Hello, world!"}
				],
				"system": "You are a helpful assistant.",
				"temperature": 0.7
			}`
			var req schema.AnthropicRequest
			err := json.Unmarshal([]byte(jsonData), &req)
			Expect(err).ToNot(HaveOccurred())
			Expect(req.Model).To(Equal("claude-3-sonnet-20240229"))
			Expect(req.MaxTokens).To(Equal(1024))
			Expect(req.Messages).To(HaveLen(1))
			Expect(req.System).To(Equal("You are a helpful assistant."))
			// Guard the dereference so a missing field fails the spec instead of panicking.
			Expect(req.Temperature).ToNot(BeNil())
			Expect(*req.Temperature).To(Equal(0.7))
		})

		It("should unmarshal a request with tools", func() {
			jsonData := `{
				"model": "claude-3-sonnet-20240229",
				"max_tokens": 1024,
				"messages": [
					{"role": "user", "content": "What's the weather?"}
				],
				"tools": [
					{
						"name": "get_weather",
						"description": "Get the current weather",
						"input_schema": {
							"type": "object",
							"properties": {
								"location": {"type": "string"}
							}
						}
					}
				],
				"tool_choice": {"type": "tool", "name": "get_weather"}
			}`
			var req schema.AnthropicRequest
			err := json.Unmarshal([]byte(jsonData), &req)
			Expect(err).ToNot(HaveOccurred())
			Expect(req.Tools).To(HaveLen(1))
			Expect(req.Tools[0].Name).To(Equal("get_weather"))
			Expect(req.Tools[0].Description).To(Equal("Get the current weather"))
			Expect(req.ToolChoice).ToNot(BeNil())
		})

		It("should implement LocalAIRequest interface", func() {
			req := &schema.AnthropicRequest{Model: "test-model"}
			Expect(req.ModelName(nil)).To(Equal("test-model"))
			newModel := "new-model"
			Expect(req.ModelName(&newModel)).To(Equal("new-model"))
			Expect(req.Model).To(Equal("new-model"))
		})
	})

	Describe("AnthropicMessage", func() {
		It("should get string content from string content", func() {
			msg := schema.AnthropicMessage{
				Role:    "user",
				Content: "Hello, world!",
			}
			Expect(msg.GetStringContent()).To(Equal("Hello, world!"))
		})

		It("should get string content from array content", func() {
			msg := schema.AnthropicMessage{
				Role: "user",
				Content: []interface{}{
					map[string]interface{}{"type": "text", "text": "Hello, "},
					map[string]interface{}{"type": "text", "text": "world!"},
				},
			}
			Expect(msg.GetStringContent()).To(Equal("Hello, world!"))
		})

		It("should get content blocks from string content", func() {
			msg := schema.AnthropicMessage{
				Role:    "user",
				Content: "Hello, world!",
			}
			blocks := msg.GetContentBlocks()
			Expect(blocks).To(HaveLen(1))
			Expect(blocks[0].Type).To(Equal("text"))
			Expect(blocks[0].Text).To(Equal("Hello, world!"))
		})

		It("should get content blocks from array content", func() {
			msg := schema.AnthropicMessage{
				Role: "user",
				Content: []interface{}{
					map[string]interface{}{"type": "text", "text": "Hello"},
					map[string]interface{}{"type": "image", "source": map[string]interface{}{"type": "base64", "data": "abc123"}},
				},
			}
			blocks := msg.GetContentBlocks()
			Expect(blocks).To(HaveLen(2))
			Expect(blocks[0].Type).To(Equal("text"))
			Expect(blocks[0].Text).To(Equal("Hello"))
			// The image block must survive the conversion with its source intact.
			Expect(blocks[1].Type).To(Equal("image"))
			Expect(blocks[1].Source).ToNot(BeNil())
			Expect(blocks[1].Source.Type).To(Equal("base64"))
			Expect(blocks[1].Source.Data).To(Equal("abc123"))
		})
	})

	Describe("AnthropicResponse", func() {
		It("should marshal a valid response", func() {
			stopReason := "end_turn"
			resp := schema.AnthropicResponse{
				ID:         "msg_123",
				Type:       "message",
				Role:       "assistant",
				Model:      "claude-3-sonnet-20240229",
				StopReason: &stopReason,
				Content: []schema.AnthropicContentBlock{
					{Type: "text", Text: "Hello!"},
				},
				Usage: schema.AnthropicUsage{
					InputTokens:  10,
					OutputTokens: 5,
				},
			}
			data, err := json.Marshal(resp)
			Expect(err).ToNot(HaveOccurred())
			var result map[string]interface{}
			err = json.Unmarshal(data, &result)
			Expect(err).ToNot(HaveOccurred())
			Expect(result["id"]).To(Equal("msg_123"))
			Expect(result["type"]).To(Equal("message"))
			Expect(result["role"]).To(Equal("assistant"))
			Expect(result["stop_reason"]).To(Equal("end_turn"))
		})

		It("should marshal a response with tool use", func() {
			stopReason := "tool_use"
			resp := schema.AnthropicResponse{
				ID:         "msg_123",
				Type:       "message",
				Role:       "assistant",
				Model:      "claude-3-sonnet-20240229",
				StopReason: &stopReason,
				Content: []schema.AnthropicContentBlock{
					{
						Type: "tool_use",
						ID:   "toolu_123",
						Name: "get_weather",
						Input: map[string]interface{}{
							"location": "San Francisco",
						},
					},
				},
				Usage: schema.AnthropicUsage{
					InputTokens:  10,
					OutputTokens: 5,
				},
			}
			data, err := json.Marshal(resp)
			Expect(err).ToNot(HaveOccurred())
			var result map[string]interface{}
			err = json.Unmarshal(data, &result)
			Expect(err).ToNot(HaveOccurred())
			Expect(result["stop_reason"]).To(Equal("tool_use"))
			content := result["content"].([]interface{})
			Expect(content).To(HaveLen(1))
			toolUse := content[0].(map[string]interface{})
			Expect(toolUse["type"]).To(Equal("tool_use"))
			Expect(toolUse["id"]).To(Equal("toolu_123"))
			Expect(toolUse["name"]).To(Equal("get_weather"))
			// The tool arguments must be serialized along with the call.
			Expect(toolUse["input"]).To(HaveKeyWithValue("location", "San Francisco"))
		})
	})

	Describe("AnthropicErrorResponse", func() {
		It("should marshal an error response", func() {
			resp := schema.AnthropicErrorResponse{
				Type: "error",
				Error: schema.AnthropicError{
					Type:    "invalid_request_error",
					Message: "max_tokens is required",
				},
			}
			data, err := json.Marshal(resp)
			Expect(err).ToNot(HaveOccurred())
			var result map[string]interface{}
			err = json.Unmarshal(data, &result)
			Expect(err).ToNot(HaveOccurred())
			Expect(result["type"]).To(Equal("error"))
			errorObj := result["error"].(map[string]interface{})
			Expect(errorObj["type"]).To(Equal("invalid_request_error"))
			Expect(errorObj["message"]).To(Equal("max_tokens is required"))
		})
	})
})

View File

@@ -27,9 +27,6 @@ type Message struct {
FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty" yaml:"tool_call,omitempty"`
// Reasoning content extracted from <thinking>...</thinking> tags
Reasoning *string `json:"reasoning,omitempty" yaml:"reasoning,omitempty"`
}
type ToolCall struct {
@@ -81,8 +78,8 @@ func (messages Messages) ToProto() []*proto.Message {
}
}
// Note: tool_call_id is not in schema.Message yet
// Reasoning field is now available in schema.Message but not yet in proto.Message
// Note: tool_call_id and reasoning_content are not in schema.Message yet
// They may need to be added to schema.Message if needed in the future
}
return protoMessages
}

View File

@@ -1,56 +1,4 @@
---
- name: "qwen3-vl-reranker-8b"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF
description: |
**Model Name:** Qwen3-VL-Reranker-8B
**Base Model:** Qwen/Qwen3-VL-Reranker-8B
**Description:**
A high-performance multimodal reranking model for state-of-the-art cross-modal search. It supports 30+ languages and handles text, images, screenshots, videos, and mixed modalities. With 8B parameters and a 32K context length, it refines retrieval results by combining embedding vectors with precise relevance scores. Optimized for efficiency, it supports quantized versions (e.g., Q8_0, Q4_K_M) and is ideal for applications requiring accurate multimodal content matching.
**Key Features:**
- **Multimodal**: Text, images, videos, and mixed content.
- **Language Support**: 30+ languages.
- **Quantization**: Available in Q8_0 (best quality), Q4_K_M (fast, recommended), and lower-precision options.
- **Performance**: Outperforms base models in retrieval tasks (e.g., JinaVDR, ViDoRe v3).
- **Use Case**: Enhances search pipelines by refining embeddings with precise relevance scores.
**Downloads:**
- [GGUF Files](https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF) (e.g., `Qwen3-VL-Reranker-8B.Q8_0.gguf`).
**Usage:**
- Requires `transformers`, `qwen-vl-utils`, and `torch`.
- Example: `from scripts.qwen3_vl_reranker import Qwen3VLReranker; model = Qwen3VLReranker(...)`
**Citation:**
@article{qwen3vlembedding, ...}
In short, it is a capable, efficient, and versatile reranker for multimodal search tasks.
overrides:
parameters:
model: llama-cpp/models/Qwen3-VL-Reranker-8B.Q4_K_M.gguf
name: Qwen3-VL-Reranker-8B-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
mmproj: llama-cpp/mmproj/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
description: Imported from https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF
options:
- use_jinja:true
files:
- filename: llama-cpp/models/Qwen3-VL-Reranker-8B.Q4_K_M.gguf
sha256: f73e62ea68abf741c3e713af823cfb4d2fd2ca35c8b68277b87b4b3d8570b66d
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF/resolve/main/Qwen3-VL-Reranker-8B.Q4_K_M.gguf
- filename: llama-cpp/mmproj/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
sha256: 15cd9bd4882dae771344f0ac204fce07de91b47c1438ada3861dfc817403c31e
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF/resolve/main/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
- name: "liquidai.lfm2-2.6b-transcript"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
@@ -6163,7 +6111,6 @@
tags:
- embeddings
overrides:
backend: llama-cpp
embeddings: true
parameters:
model: granite-embedding-107m-multilingual-f16.gguf

7
go.mod
View File

@@ -9,7 +9,6 @@ require (
fyne.io/fyne/v2 v2.7.1
github.com/Masterminds/sprig/v3 v3.3.0
github.com/alecthomas/kong v1.13.0
github.com/anthropics/anthropic-sdk-go v1.19.0
github.com/charmbracelet/glamour v0.10.0
github.com/containerd/containerd v1.7.30
github.com/ebitengine/purego v0.9.1
@@ -59,7 +58,6 @@ require (
go.opentelemetry.io/otel/metric v1.39.0
go.opentelemetry.io/otel/sdk/metric v1.39.0
google.golang.org/grpc v1.78.0
google.golang.org/protobuf v1.36.10
gopkg.in/yaml.v2 v2.4.0
gopkg.in/yaml.v3 v3.0.1
oras.land/oras-go/v2 v2.6.0
@@ -69,11 +67,8 @@ require (
github.com/ghodss/yaml v1.0.0 // indirect
github.com/labstack/gommon v0.4.2 // indirect
github.com/swaggo/files/v2 v2.0.2 // indirect
github.com/tidwall/gjson v1.18.0 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/tidwall/sjson v1.2.5 // indirect
github.com/valyala/fasttemplate v1.2.2 // indirect
google.golang.org/protobuf v1.36.10 // indirect
)
require (

4
go.sum
View File

@@ -44,8 +44,6 @@ github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c=
github.com/anthropics/anthropic-sdk-go v1.19.0 h1:mO6E+ffSzLRvR/YUH9KJC0uGw0uV8GjISIuzem//3KE=
github.com/anthropics/anthropic-sdk-go v1.19.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8=
@@ -764,12 +762,10 @@ github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4
github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA=
github.com/testcontainers/testcontainers-go v0.40.0 h1:pSdJYLOVgLE8YdUY2FHQ1Fxu+aMnb6JfVz1mxk7OeMU=
github.com/testcontainers/testcontainers-go v0.40.0/go.mod h1:FSXV5KQtX2HAMlm7U3APNyLkkap35zNLxukw9oBi/MY=
github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=

View File

@@ -1,114 +0,0 @@
package functions
import (
"strings"
)
// ExtractReasoning splits model output into reasoning and visible content.
//
// It scans content left to right for <thinking>...</thinking> and
// <think>...</think> blocks (matched case-sensitively). The text inside each
// block is whitespace-trimmed and collected; non-empty pieces are joined with
// a blank line ("\n\n") and returned as reasoning. Everything outside the
// blocks is concatenated unchanged, tags removed, and returned as
// cleanedContent.
//
// A block ends at the first closing tag of the same kind, so any other tags
// inside it are kept verbatim as part of the reasoning. An opening tag with
// no matching closing tag consumes the rest of the input as reasoning.
func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
	if len(content) == 0 {
		return "", content
	}

	type tagPair struct{ opener, closer string }
	pairs := [...]tagPair{
		{"<thinking>", "</thinking>"},
		{"<think>", "</think>"},
	}

	var thoughts, kept []string
	cursor := 0 // everything before cursor has already been classified

	for {
		// Pick whichever opening tag appears first at or after cursor.
		blockStart := -1
		var innerStart, innerEnd, resumeAt int
		for _, pair := range pairs {
			offset := strings.Index(content[cursor:], pair.opener)
			if offset < 0 {
				continue
			}
			openAt := cursor + offset
			if blockStart != -1 && openAt >= blockStart {
				continue
			}

			blockStart = openAt
			innerStart = openAt + len(pair.opener)
			if closeOffset := strings.Index(content[innerStart:], pair.closer); closeOffset >= 0 {
				innerEnd = innerStart + closeOffset
				resumeAt = innerEnd + len(pair.closer)
			} else {
				// Unclosed block: the reasoning runs to the end of the input.
				innerEnd = len(content)
				resumeAt = len(content)
			}
		}

		if blockStart == -1 {
			// No further blocks; whatever is left is visible content.
			if cursor < len(content) {
				kept = append(kept, content[cursor:])
			}
			break
		}

		if blockStart > cursor {
			kept = append(kept, content[cursor:blockStart])
		}
		if innerEnd > innerStart {
			if thought := strings.TrimSpace(content[innerStart:innerEnd]); thought != "" {
				thoughts = append(thoughts, thought)
			}
		}
		cursor = resumeAt
	}

	return strings.Join(thoughts, "\n\n"), strings.Join(kept, "")
}

View File

@@ -1,261 +0,0 @@
package functions_test
import (
"strings"
. "github.com/mudler/LocalAI/pkg/functions"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for ExtractReasoning. Only the tags themselves are removed from the
// visible content, so whitespace on both sides of a removed block is kept:
// "a <think>x</think> b" cleans to "a  b" (two spaces).
var _ = Describe("ExtractReasoning", func() {
	Context("when content has no reasoning tags", func() {
		It("should return empty reasoning and original content", func() {
			content := "This is regular content without any tags."
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(BeEmpty())
			Expect(cleaned).To(Equal(content))
		})

		It("should handle empty string", func() {
			content := ""
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(BeEmpty())
			Expect(cleaned).To(BeEmpty())
		})

		It("should handle content with only whitespace", func() {
			content := " \n\t "
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(BeEmpty())
			Expect(cleaned).To(Equal(content))
		})
	})

	Context("when content has <thinking> tags", func() {
		It("should extract reasoning from single thinking block", func() {
			content := "Some text <thinking>This is my reasoning</thinking> More text"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("This is my reasoning"))
			Expect(cleaned).To(Equal("Some text  More text"))
		})

		It("should extract reasoning and preserve surrounding content", func() {
			content := "Before <thinking>Reasoning here</thinking> After"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Reasoning here"))
			Expect(cleaned).To(Equal("Before  After"))
		})

		It("should handle thinking block at the start", func() {
			content := "<thinking>Start reasoning</thinking> Regular content"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Start reasoning"))
			Expect(cleaned).To(Equal(" Regular content"))
		})

		It("should handle thinking block at the end", func() {
			content := "Regular content <thinking>End reasoning</thinking>"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("End reasoning"))
			Expect(cleaned).To(Equal("Regular content "))
		})

		It("should handle only thinking block", func() {
			content := "<thinking>Only reasoning</thinking>"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Only reasoning"))
			Expect(cleaned).To(BeEmpty())
		})

		It("should trim whitespace from reasoning content", func() {
			content := "Text <thinking> \n Reasoning with spaces \n </thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Reasoning with spaces"))
			Expect(cleaned).To(Equal("Text  More"))
		})
	})

	Context("when content has <think> tags", func() {
		It("should extract reasoning from a think block", func() {
			content := "Text <think>Redacted reasoning</think> More"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Redacted reasoning"))
			Expect(cleaned).To(Equal("Text  More"))
		})

		It("should handle a think block with multiline content", func() {
			content := "Before <think>Line 1\nLine 2\nLine 3</think> After"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
			Expect(cleaned).To(Equal("Before  After"))
		})

		It("should handle a think block with complex content", func() {
			content := "Start <think>Complex reasoning\nwith\nmultiple\nlines</think> End"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Complex reasoning\nwith\nmultiple\nlines"))
			Expect(cleaned).To(Equal("Start  End"))
		})
	})

	Context("when content has multiple reasoning blocks", func() {
		It("should concatenate multiple thinking blocks with a blank line", func() {
			content := "Text <thinking>First</thinking> Middle <thinking>Second</thinking> End"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("First\n\nSecond"))
			Expect(cleaned).To(Equal("Text  Middle  End"))
		})

		It("should handle multiple different tag types", func() {
			content := "A <thinking>One</thinking> B <think>Two</think> C <think>Three</think> D"
			reasoning, cleaned := ExtractReasoning(content)
			// Blocks are collected in order of appearance, whatever their tag.
			Expect(reasoning).To(Equal("One\n\nTwo\n\nThree"))
			Expect(cleaned).To(Equal("A  B  C  D"))
		})

		It("should handle nested tags correctly (extracts first match)", func() {
			content := "Text <thinking>Outer <think>Inner</think></thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			// The outer block wins; the inner tags stay verbatim in the reasoning.
			Expect(reasoning).To(Equal("Outer <think>Inner</think>"))
			Expect(cleaned).To(Equal("Text  More"))
		})
	})

	Context("when content has unclosed reasoning tags", func() {
		It("should extract unclosed thinking block", func() {
			content := "Text <thinking>Unclosed reasoning"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Unclosed reasoning"))
			Expect(cleaned).To(Equal("Text "))
		})

		It("should extract unclosed think block", func() {
			content := "Before <think>Incomplete"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Incomplete"))
			Expect(cleaned).To(Equal("Before "))
		})

		It("should extract unclosed think block with longer content", func() {
			content := "Start <think>Partial reasoning content"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Partial reasoning content"))
			Expect(cleaned).To(Equal("Start "))
		})

		It("should handle unclosed tag at the end", func() {
			content := "Regular content <thinking>Unclosed at end"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Unclosed at end"))
			Expect(cleaned).To(Equal("Regular content "))
		})
	})

	Context("when content has empty reasoning blocks", func() {
		It("should ignore empty thinking block", func() {
			content := "Text <thinking></thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(BeEmpty())
			Expect(cleaned).To(Equal("Text  More"))
		})

		It("should ignore thinking block with only whitespace", func() {
			content := "Text <thinking> \n\t </thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(BeEmpty())
			Expect(cleaned).To(Equal("Text  More"))
		})
	})

	Context("when content has reasoning tags with special characters", func() {
		It("should handle reasoning with newlines", func() {
			content := "Before <thinking>Line 1\nLine 2\nLine 3</thinking> After"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
			Expect(cleaned).To(Equal("Before  After"))
		})

		It("should handle reasoning with code blocks", func() {
			content := "Text <thinking>Reasoning with ```code``` blocks</thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Reasoning with ```code``` blocks"))
			Expect(cleaned).To(Equal("Text  More"))
		})

		It("should handle reasoning with JSON", func() {
			content := "Before <think>{\"key\": \"value\"}</think> After"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("{\"key\": \"value\"}"))
			Expect(cleaned).To(Equal("Before  After"))
		})

		It("should handle reasoning with HTML-like content", func() {
			content := "Text <thinking>Reasoning with <tags> inside</thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Reasoning with <tags> inside"))
			Expect(cleaned).To(Equal("Text  More"))
		})
	})

	Context("when content has reasoning mixed with regular content", func() {
		It("should preserve content order correctly", func() {
			content := "Start <thinking>Reasoning</thinking> Middle <think>More reasoning</think> End"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Reasoning\n\nMore reasoning"))
			Expect(cleaned).To(Equal("Start  Middle  End"))
		})

		It("should handle reasoning in the middle of a sentence", func() {
			content := "This is a <thinking>reasoning</thinking> sentence."
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("reasoning"))
			Expect(cleaned).To(Equal("This is a  sentence."))
		})
	})

	Context("edge cases", func() {
		It("should handle content with only opening tag", func() {
			content := "<thinking>"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(BeEmpty())
			Expect(cleaned).To(BeEmpty())
		})

		It("should handle content with only closing tag", func() {
			content := "</thinking>"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(BeEmpty())
			Expect(cleaned).To(Equal("</thinking>"))
		})

		It("should handle mismatched tags", func() {
			content := "<thinking>Content</think>"
			reasoning, cleaned := ExtractReasoning(content)
			// The thinking block is unclosed, so it swallows the stray </think> too.
			Expect(reasoning).To(Equal("Content</think>"))
			Expect(cleaned).To(BeEmpty())
		})

		It("should handle very long reasoning content", func() {
			longReasoning := strings.Repeat("This is reasoning content. ", 100)
			content := "Text <thinking>" + longReasoning + "</thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			// TrimSpace is applied, so we need to account for that
			Expect(reasoning).To(Equal(strings.TrimSpace(longReasoning)))
			Expect(cleaned).To(Equal("Text  More"))
		})

		It("should handle reasoning with unicode characters", func() {
			content := "Text <thinking>Reasoning with 中文 and emoji 🧠</thinking> More"
			reasoning, cleaned := ExtractReasoning(content)
			Expect(reasoning).To(Equal("Reasoning with 中文 and emoji 🧠"))
			Expect(cleaned).To(Equal("Text  More"))
		})
	})
})

View File

@@ -24,6 +24,8 @@ func (ml *ModelLoader) deleteProcess(s string) error {
return fmt.Errorf("model %s not found", s)
}
defer delete(ml.models, s)
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
xlog.Debug("Model busy. Waiting.", "model", s)
@@ -46,7 +48,6 @@ func (ml *ModelLoader) deleteProcess(s string) error {
if process == nil {
xlog.Error("No process", "model", s)
// Nothing to do as there is no process
delete(ml.models, s)
return nil
}
@@ -55,10 +56,6 @@ func (ml *ModelLoader) deleteProcess(s string) error {
xlog.Error("(deleteProcess) error while deleting process", "error", err, "model", s)
}
if err == nil {
delete(ml.models, s)
}
return err
}

View File

@@ -12,17 +12,15 @@ import (
)
const (
// Public constants - used by tests and external packages
Nvidia = "nvidia"
AMD = "amd"
Intel = "intel"
// Private constants - only used within this package
defaultCapability = "default"
nvidiaL4T = "nvidia-l4t"
darwinX86 = "darwin-x86"
metal = "metal"
vulkan = "vulkan"
nvidia = "nvidia"
amd = "amd"
intel = "intel"
vulkan = "vulkan"
nvidiaCuda13 = "nvidia-cuda-13"
nvidiaCuda12 = "nvidia-cuda-12"
@@ -32,16 +30,6 @@ const (
capabilityEnv = "LOCALAI_FORCE_META_BACKEND_CAPABILITY"
capabilityRunFileEnv = "LOCALAI_FORCE_META_BACKEND_CAPABILITY_RUN_FILE"
defaultRunFile = "/run/localai/capability"
// Backend detection tokens (private)
backendTokenDarwin = "darwin"
backendTokenMLX = "mlx"
backendTokenMetal = "metal"
backendTokenL4T = "l4t"
backendTokenCUDA = "cuda"
backendTokenROCM = "rocm"
backendTokenHIP = "hip"
backendTokenSYCL = "sycl"
)
var (
@@ -108,7 +96,7 @@ func (s *SystemState) getSystemCapabilities() string {
// If arm64 on linux and a nvidia gpu is detected, we will return nvidia-l4t
if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" {
if s.GPUVendor == Nvidia {
if s.GPUVendor == nvidia {
xlog.Info("Using nvidia-l4t capability (arm64 on linux)", "env", capabilityEnv)
if cuda13DirExists {
return nvidiaL4TCuda13
@@ -143,6 +131,7 @@ func (s *SystemState) getSystemCapabilities() string {
return s.GPUVendor
}
// BackendPreferenceTokens returns a list of substrings that represent the preferred
// backend implementation order for the current system capability. Callers can use
// these tokens to select the most appropriate concrete backend among multiple
@@ -150,76 +139,19 @@ func (s *SystemState) getSystemCapabilities() string {
func (s *SystemState) BackendPreferenceTokens() []string {
capStr := strings.ToLower(s.getSystemCapabilities())
switch {
case strings.HasPrefix(capStr, Nvidia):
return []string{backendTokenCUDA, vulkan, "cpu"}
case strings.HasPrefix(capStr, AMD):
return []string{backendTokenROCM, backendTokenHIP, vulkan, "cpu"}
case strings.HasPrefix(capStr, Intel):
return []string{backendTokenSYCL, Intel, "cpu"}
case strings.HasPrefix(capStr, nvidia):
return []string{"cuda", "vulkan", "cpu"}
case strings.HasPrefix(capStr, amd):
return []string{"rocm", "hip", "vulkan", "cpu"}
case strings.HasPrefix(capStr, intel):
return []string{"sycl", intel, "cpu"}
case strings.HasPrefix(capStr, metal):
return []string{backendTokenMetal, "cpu"}
return []string{"metal", "cpu"}
case strings.HasPrefix(capStr, darwinX86):
return []string{"darwin-x86", "cpu"}
case strings.HasPrefix(capStr, vulkan):
return []string{vulkan, "cpu"}
return []string{"vulkan", "cpu"}
default:
return []string{"cpu"}
}
}
// DetectedCapability returns the detected system capability string.
// This can be used by the UI to display what capability was detected.
// It is the exported entry point to getSystemCapabilities and delegates to it
// on every call, returning its result unchanged.
func (s *SystemState) DetectedCapability() string {
return s.getSystemCapabilities()
}
// IsBackendCompatible reports whether the backend identified by name and uri
// can run on the current system.
//
// The decision compares the capability string from getSystemCapabilities with
// hardware tokens found in the lower-cased "name uri" text. The first matching
// family below decides the result:
//
//   - darwin / mlx / metal: capability must be exactly metal or darwin-x86
//   - l4t: capability must start with nvidia-l4t (checked before the generic
//     NVIDIA rule because L4T backends may also mention cuda or nvidia)
//   - cuda / nvidia: capability must start with nvidia
//   - rocm / hip / amd: capability must be exactly amd
//   - sycl / intel: capability must be exactly intel
//
// A backend that mentions none of these tokens is treated as a CPU backend and
// is always compatible.
//
// NOTE(review): tokens are matched as plain substrings, so "amd" also matches
// an "amd64" architecture suffix and "hip" matches any word containing it —
// confirm that backend names and URIs never carry such text.
func (s *SystemState) IsBackendCompatible(name, uri string) bool {
	haystack := strings.ToLower(name + " " + uri)
	detected := s.getSystemCapabilities()

	// mentions reports whether any of the tokens occurs in the backend text.
	mentions := func(tokens ...string) bool {
		for _, token := range tokens {
			if strings.Contains(haystack, token) {
				return true
			}
		}
		return false
	}

	switch {
	case mentions(backendTokenDarwin, backendTokenMLX, backendTokenMetal):
		return detected == metal || detected == darwinX86
	case mentions(backendTokenL4T):
		return strings.HasPrefix(detected, nvidiaL4T)
	case mentions(backendTokenCUDA, Nvidia):
		return strings.HasPrefix(detected, Nvidia)
	case mentions(backendTokenROCM, backendTokenHIP, AMD):
		return detected == AMD
	case mentions(backendTokenSYCL, Intel):
		return detected == Intel
	default:
		return true
	}
}

View File

@@ -1198,30 +1198,6 @@ const docTemplate = `{
}
}
},
"/v1/messages": {
"post": {
"summary": "Generate a message response for the given messages and model.",
"parameters": [
{
"description": "query params",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.AnthropicRequest"
}
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.AnthropicResponse"
}
}
}
}
},
"/v1/models": {
"get": {
"summary": "List and describe the various models available in the API.",
@@ -1763,169 +1739,6 @@ const docTemplate = `{
}
}
},
"schema.AnthropicContentBlock": {
"type": "object",
"properties": {
"content": {},
"id": {
"type": "string"
},
"input": {
"type": "object",
"additionalProperties": true
},
"is_error": {
"type": "boolean"
},
"name": {
"type": "string"
},
"source": {
"$ref": "#/definitions/schema.AnthropicImageSource"
},
"text": {
"type": "string"
},
"tool_use_id": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"schema.AnthropicImageSource": {
"type": "object",
"properties": {
"data": {
"type": "string"
},
"media_type": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"schema.AnthropicMessage": {
"type": "object",
"properties": {
"content": {},
"role": {
"type": "string"
}
}
},
"schema.AnthropicRequest": {
"type": "object",
"properties": {
"max_tokens": {
"type": "integer"
},
"messages": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.AnthropicMessage"
}
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"stop_sequences": {
"type": "array",
"items": {
"type": "string"
}
},
"stream": {
"type": "boolean"
},
"system": {
"type": "string"
},
"temperature": {
"type": "number"
},
"tool_choice": {},
"tools": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.AnthropicTool"
}
},
"top_k": {
"type": "integer"
},
"top_p": {
"type": "number"
}
}
},
"schema.AnthropicResponse": {
"type": "object",
"properties": {
"content": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.AnthropicContentBlock"
}
},
"id": {
"type": "string"
},
"model": {
"type": "string"
},
"role": {
"type": "string"
},
"stop_reason": {
"type": "string"
},
"stop_sequence": {
"type": "string"
},
"type": {
"type": "string"
},
"usage": {
"$ref": "#/definitions/schema.AnthropicUsage"
}
}
},
"schema.AnthropicTool": {
"type": "object",
"properties": {
"description": {
"type": "string"
},
"input_schema": {
"type": "object",
"additionalProperties": true
},
"name": {
"type": "string"
}
}
},
"schema.AnthropicUsage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"output_tokens": {
"type": "integer"
}
}
},
"schema.BackendMonitorRequest": {
"type": "object",
"properties": {
@@ -2416,10 +2229,6 @@ const docTemplate = `{
"description": "The message name (used for tools calls)",
"type": "string"
},
"reasoning": {
"description": "Reasoning content extracted from \u003cthinking\u003e...\u003c/thinking\u003e tags",
"type": "string"
},
"role": {
"description": "The message role",
"type": "string"

View File

@@ -1191,30 +1191,6 @@
}
}
},
"/v1/messages": {
"post": {
"summary": "Generate a message response for the given messages and model.",
"parameters": [
{
"description": "query params",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.AnthropicRequest"
}
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.AnthropicResponse"
}
}
}
}
},
"/v1/models": {
"get": {
"summary": "List and describe the various models available in the API.",
@@ -1756,169 +1732,6 @@
}
}
},
"schema.AnthropicContentBlock": {
"type": "object",
"properties": {
"content": {},
"id": {
"type": "string"
},
"input": {
"type": "object",
"additionalProperties": true
},
"is_error": {
"type": "boolean"
},
"name": {
"type": "string"
},
"source": {
"$ref": "#/definitions/schema.AnthropicImageSource"
},
"text": {
"type": "string"
},
"tool_use_id": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"schema.AnthropicImageSource": {
"type": "object",
"properties": {
"data": {
"type": "string"
},
"media_type": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"schema.AnthropicMessage": {
"type": "object",
"properties": {
"content": {},
"role": {
"type": "string"
}
}
},
"schema.AnthropicRequest": {
"type": "object",
"properties": {
"max_tokens": {
"type": "integer"
},
"messages": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.AnthropicMessage"
}
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"stop_sequences": {
"type": "array",
"items": {
"type": "string"
}
},
"stream": {
"type": "boolean"
},
"system": {
"type": "string"
},
"temperature": {
"type": "number"
},
"tool_choice": {},
"tools": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.AnthropicTool"
}
},
"top_k": {
"type": "integer"
},
"top_p": {
"type": "number"
}
}
},
"schema.AnthropicResponse": {
"type": "object",
"properties": {
"content": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.AnthropicContentBlock"
}
},
"id": {
"type": "string"
},
"model": {
"type": "string"
},
"role": {
"type": "string"
},
"stop_reason": {
"type": "string"
},
"stop_sequence": {
"type": "string"
},
"type": {
"type": "string"
},
"usage": {
"$ref": "#/definitions/schema.AnthropicUsage"
}
}
},
"schema.AnthropicTool": {
"type": "object",
"properties": {
"description": {
"type": "string"
},
"input_schema": {
"type": "object",
"additionalProperties": true
},
"name": {
"type": "string"
}
}
},
"schema.AnthropicUsage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"output_tokens": {
"type": "integer"
}
}
},
"schema.BackendMonitorRequest": {
"type": "object",
"properties": {
@@ -2409,10 +2222,6 @@
"description": "The message name (used for tools calls)",
"type": "string"
},
"reasoning": {
"description": "Reasoning content extracted from \u003cthinking\u003e...\u003c/thinking\u003e tags",
"type": "string"
},
"role": {
"description": "The message role",
"type": "string"

View File

@@ -239,114 +239,6 @@ definitions:
start:
type: number
type: object
schema.AnthropicContentBlock:
properties:
content: {}
id:
type: string
input:
additionalProperties: true
type: object
is_error:
type: boolean
name:
type: string
source:
$ref: '#/definitions/schema.AnthropicImageSource'
text:
type: string
tool_use_id:
type: string
type:
type: string
type: object
schema.AnthropicImageSource:
properties:
data:
type: string
media_type:
type: string
type:
type: string
type: object
schema.AnthropicMessage:
properties:
content: {}
role:
type: string
type: object
schema.AnthropicRequest:
properties:
max_tokens:
type: integer
messages:
items:
$ref: '#/definitions/schema.AnthropicMessage'
type: array
metadata:
additionalProperties:
type: string
type: object
model:
type: string
stop_sequences:
items:
type: string
type: array
stream:
type: boolean
system:
type: string
temperature:
type: number
tool_choice: {}
tools:
items:
$ref: '#/definitions/schema.AnthropicTool'
type: array
top_k:
type: integer
top_p:
type: number
type: object
schema.AnthropicResponse:
properties:
content:
items:
$ref: '#/definitions/schema.AnthropicContentBlock'
type: array
id:
type: string
model:
type: string
role:
type: string
stop_reason:
type: string
stop_sequence:
type: string
type:
type: string
usage:
$ref: '#/definitions/schema.AnthropicUsage'
type: object
schema.AnthropicTool:
properties:
description:
type: string
input_schema:
additionalProperties: true
type: object
name:
type: string
type: object
schema.AnthropicUsage:
properties:
input_tokens:
type: integer
output_tokens:
type: integer
type: object
schema.BackendMonitorRequest:
properties:
model:
@@ -681,9 +573,6 @@ definitions:
name:
description: The message name (used for tools calls)
type: string
reasoning:
description: Reasoning content extracted from <thinking>...</thinking> tags
type: string
role:
description: The message role
type: string
@@ -1924,21 +1813,6 @@ paths:
schema:
$ref: '#/definitions/schema.OpenAIResponse'
summary: Stream MCP chat completions with reasoning, tool calls, and results
/v1/messages:
post:
parameters:
- description: query params
in: body
name: request
required: true
schema:
$ref: '#/definitions/schema.AnthropicRequest'
responses:
"200":
description: Response
schema:
$ref: '#/definitions/schema.AnthropicResponse'
summary: Generate a message response for the given messages and model.
/v1/models:
get:
responses:

View File

@@ -1,375 +0,0 @@
package e2e_test
import (
"context"
"github.com/anthropics/anthropic-sdk-go"
"github.com/anthropics/anthropic-sdk-go/option"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Anthropic API E2E test: drives the Messages endpoint through the
// anthropic-sdk-go client and covers non-streaming responses, streaming
// responses, and tool calling. Every request uses the model name "gpt-4".
// localAIURL is not declared in this file (presumably provided by the suite
// setup elsewhere in package e2e_test — TODO confirm).
var _ = Describe("Anthropic API E2E test", func() {
// client is reassigned in BeforeEach and used by every spec below.
var client anthropic.Client
Context("API with Anthropic SDK", func() {
BeforeEach(func() {
// Create Anthropic client pointing to LocalAI
client = anthropic.NewClient(
option.WithBaseURL(localAIURL),
option.WithAPIKey("test-api-key"), // LocalAI doesn't require a real API key
)
// Wait for API to be ready by attempting a simple request
// The probe is a real 10-token completion; Eventually keeps retrying it
// (with the "2m" timeout argument) until it returns a nil error.
Eventually(func() error {
_, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 10,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("Hi")),
},
})
return err
}, "2m").ShouldNot(HaveOccurred())
})
Context("Non-streaming responses", func() {
// Asserts the response envelope (role, stop reason, type) and that the
// first content block is text containing "4" or "four".
It("generates a response for a simple message", func() {
message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("How much is 2+2? Reply with just the number.")),
},
})
Expect(err).ToNot(HaveOccurred())
Expect(message.Content).ToNot(BeEmpty())
// Role is a constant type that defaults to "assistant"
Expect(string(message.Role)).To(Equal("assistant"))
Expect(message.StopReason).To(Equal(anthropic.MessageStopReasonEndTurn))
Expect(string(message.Type)).To(Equal("message"))
// Check that content contains text block with expected answer
Expect(len(message.Content)).To(BeNumerically(">=", 1))
textBlock := message.Content[0]
Expect(string(textBlock.Type)).To(Equal("text"))
Expect(textBlock.Text).To(Or(ContainSubstring("4"), ContainSubstring("four")))
})
// Sends a System prompt alongside the user message.
// NOTE(review): the prompt asks for uppercase output, but the spec only
// asserts that the request succeeds and returns at least one content block.
It("handles system prompts", func() {
message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
System: []anthropic.TextBlockParam{
{Text: "You are a helpful assistant. Always respond in uppercase letters."},
},
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("Say hello")),
},
})
Expect(err).ToNot(HaveOccurred())
Expect(message.Content).ToNot(BeEmpty())
Expect(len(message.Content)).To(BeNumerically(">=", 1))
})
// Both input and output token counts must be reported as greater than zero.
It("returns usage information", func() {
message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 100,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("Hello")),
},
})
Expect(err).ToNot(HaveOccurred())
Expect(message.Usage.InputTokens).To(BeNumerically(">", 0))
Expect(message.Usage.OutputTokens).To(BeNumerically(">", 0))
})
})
Context("Streaming responses", func() {
// Consumes the stream event by event, folding each event into message via
// Accumulate, and requires at least one content block delta event.
It("streams tokens for a simple message", func() {
stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("Count from 1 to 5")),
},
})
message := anthropic.Message{}
eventCount := 0
hasContentDelta := false
for stream.Next() {
event := stream.Current()
err := message.Accumulate(event)
Expect(err).ToNot(HaveOccurred())
eventCount++
// Check for content block delta events
switch event.AsAny().(type) {
case anthropic.ContentBlockDeltaEvent:
hasContentDelta = true
}
}
// stream.Err() is checked only after the loop ends, i.e. once Next() returns false.
Expect(stream.Err()).ToNot(HaveOccurred())
Expect(eventCount).To(BeNumerically(">", 0))
Expect(hasContentDelta).To(BeTrue())
// Check accumulated message
Expect(message.Content).ToNot(BeEmpty())
// Role is a constant type that defaults to "assistant"
Expect(string(message.Role)).To(Equal("assistant"))
})
// Same streaming flow with a System prompt; only checks that every event
// accumulates without error and that the final content is non-empty.
It("streams with system prompt", func() {
stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
System: []anthropic.TextBlockParam{
{Text: "You are a helpful assistant."},
},
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("Say hello")),
},
})
message := anthropic.Message{}
for stream.Next() {
event := stream.Current()
err := message.Accumulate(event)
Expect(err).ToNot(HaveOccurred())
}
Expect(stream.Err()).ToNot(HaveOccurred())
Expect(message.Content).ToNot(BeEmpty())
})
})
Context("Tool calling", func() {
// Offers a single get_weather tool (with a required "location" property) and
// requires the response to contain a tool_use block for it, with a non-empty
// ID, a map input holding "location", and a tool_use stop reason.
It("handles tool calls in non-streaming mode", func() {
message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather like in San Francisco?")),
},
Tools: []anthropic.ToolParam{
{
Name: "get_weather",
Description: anthropic.F("Get the current weather in a given location"),
InputSchema: anthropic.F(map[string]interface{}{
"type": "object",
"properties": map[string]interface{}{
"location": map[string]interface{}{
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
"required": []string{"location"},
}),
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(message.Content).ToNot(BeEmpty())
// The model must use tools - find the tool use in the response
hasToolUse := false
for _, block := range message.Content {
if block.Type == anthropic.ContentBlockTypeToolUse {
hasToolUse = true
Expect(block.Name).To(Equal("get_weather"))
Expect(block.ID).ToNot(BeEmpty())
// Verify that input contains location
inputMap, ok := block.Input.(map[string]interface{})
Expect(ok).To(BeTrue())
_, hasLocation := inputMap["location"]
Expect(hasLocation).To(BeTrue())
}
}
// Model must have called the tool
Expect(hasToolUse).To(BeTrue(), "Model should have called the get_weather tool")
Expect(message.StopReason).To(Equal(anthropic.MessageStopReasonToolUse))
})
// Passes ToolChoice set to the "auto" variant. The spec does not require a
// tool call; it only asserts success and non-empty content.
It("handles tool_choice parameter", func() {
message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("Tell me about the weather")),
},
Tools: []anthropic.ToolParam{
{
Name: "get_weather",
Description: anthropic.F("Get the current weather"),
InputSchema: anthropic.F(map[string]interface{}{
"type": "object",
"properties": map[string]interface{}{
"location": map[string]interface{}{
"type": "string",
},
},
}),
},
},
ToolChoice: anthropic.F[anthropic.ToolChoiceUnionParam](
anthropic.ToolChoiceAutoParam{
Type: anthropic.F(anthropic.ToolChoiceAutoTypeAuto),
},
),
})
Expect(err).ToNot(HaveOccurred())
Expect(message.Content).ToNot(BeEmpty())
})
// Two-step round trip: the first request must yield a tool_use block; the
// second replays the conversation plus a tool result for that block's ID.
It("handles tool results in messages", func() {
// First, make a request that should trigger a tool call
firstMessage, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather in SF?")),
},
Tools: []anthropic.ToolParam{
{
Name: "get_weather",
Description: anthropic.F("Get weather"),
InputSchema: anthropic.F(map[string]interface{}{
"type": "object",
"properties": map[string]interface{}{
"location": map[string]interface{}{"type": "string"},
},
}),
},
},
})
Expect(err).ToNot(HaveOccurred())
// Find the tool use block - model must call the tool
// Only the first tool_use block is used; the loop stops at the first match.
var toolUseID string
var toolUseName string
for _, block := range firstMessage.Content {
if block.Type == anthropic.ContentBlockTypeToolUse {
toolUseID = block.ID
toolUseName = block.Name
break
}
}
// Model must have called the tool
Expect(toolUseID).ToNot(BeEmpty(), "Model should have called the get_weather tool")
// Send back a tool result and verify it's handled correctly
// The history is: original user question, the assistant's first reply
// content, then a user message carrying the tool result for toolUseID.
// NOTE(review): the trailing false passed to NewToolResultBlock is
// presumably the is_error flag — confirm against the SDK.
secondMessage, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather in SF?")),
anthropic.NewAssistantMessage(firstMessage.Content...),
anthropic.NewUserMessage(
anthropic.NewToolResultBlock(toolUseID, "Sunny, 72°F", false),
),
},
Tools: []anthropic.ToolParam{
{
// The tool is re-declared under the name the model returned in the first reply.
Name: toolUseName,
Description: anthropic.F("Get weather"),
InputSchema: anthropic.F(map[string]interface{}{
"type": "object",
"properties": map[string]interface{}{
"location": map[string]interface{}{"type": "string"},
},
}),
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(secondMessage.Content).ToNot(BeEmpty())
})
// Streaming variant of the tool-call spec: requires content_block start,
// delta and stop events, and a get_weather tool_use block in the accumulated
// message together with a tool_use stop reason.
It("handles tool calls in streaming mode", func() {
stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
Model: "gpt-4",
MaxTokens: 1024,
Messages: []anthropic.MessageParam{
anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather like in San Francisco?")),
},
Tools: []anthropic.ToolParam{
{
Name: "get_weather",
Description: anthropic.F("Get the current weather in a given location"),
InputSchema: anthropic.F(map[string]interface{}{
"type": "object",
"properties": map[string]interface{}{
"location": map[string]interface{}{
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
"required": []string{"location"},
}),
},
},
})
message := anthropic.Message{}
eventCount := 0
// NOTE(review): hasToolUseBlock is set inside the loop but never asserted
// below; the tool-use check is made on the accumulated message instead.
hasToolUseBlock := false
hasContentBlockStart := false
hasContentBlockDelta := false
hasContentBlockStop := false
for stream.Next() {
event := stream.Current()
err := message.Accumulate(event)
Expect(err).ToNot(HaveOccurred())
eventCount++
// Check for different event types related to tool use
switch e := event.AsAny().(type) {
case anthropic.ContentBlockStartEvent:
hasContentBlockStart = true
if e.ContentBlock.Type == anthropic.ContentBlockTypeToolUse {
hasToolUseBlock = true
}
case anthropic.ContentBlockDeltaEvent:
hasContentBlockDelta = true
case anthropic.ContentBlockStopEvent:
hasContentBlockStop = true
}
}
Expect(stream.Err()).ToNot(HaveOccurred())
Expect(eventCount).To(BeNumerically(">", 0))
// Verify streaming events were emitted
Expect(hasContentBlockStart).To(BeTrue(), "Should have content_block_start event")
Expect(hasContentBlockDelta).To(BeTrue(), "Should have content_block_delta event")
Expect(hasContentBlockStop).To(BeTrue(), "Should have content_block_stop event")
// Check accumulated message has tool use
Expect(message.Content).ToNot(BeEmpty())
// Model must have called the tool
foundToolUse := false
for _, block := range message.Content {
if block.Type == anthropic.ContentBlockTypeToolUse {
foundToolUse = true
Expect(block.Name).To(Equal("get_weather"))
Expect(block.ID).ToNot(BeEmpty())
}
}
Expect(foundToolUse).To(BeTrue(), "Model should have called the get_weather tool in streaming mode")
Expect(message.StopReason).To(Equal(anthropic.MessageStopReasonToolUse))
})
})
})
})