Compare commits

...

9 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
1c073f6640 Initial plan 2026-01-10 00:10:07 +00:00
LocalAI [bot]
fdc2c0737c chore: ⬆️ Update ggml-org/llama.cpp to 593da7fa49503b68f9f01700be9f508f1e528992 (#7946)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-09 21:13:04 +00:00
Ettore Di Giacinto
f4b0a304d7 chore(llama.cpp): propagate errors during model load (#7937)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-09 07:52:49 +01:00
Ettore Di Giacinto
d16ec7aa9e chore(deps): Bump llama.cpp to '480160d47297df43b43746294963476fc0a6e10f' (#7933)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-09 07:52:32 +01:00
Ettore Di Giacinto
d699b7ccdc Add backend configuration for Granite embedding model
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-01-09 00:44:10 +01:00
Ettore Di Giacinto
a4d224dd1b Revert "chore(uv): add --index-strategy=unsafe-first-match to l4t" (#7936)
Revert "chore(uv): add --index-strategy=unsafe-first-match to l4t (#7934)"

This reverts commit f5dee90962.
2026-01-08 23:31:51 +01:00
Ettore Di Giacinto
917c7aa9f3 chore(ci): roll back l4t-cuda12 configurations (#7935)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-08 23:04:33 +01:00
LocalAI [bot]
5aa66842dd chore: ⬆️ Update leejet/stable-diffusion.cpp to 0e52afc6513cc2dea9a1a017afc4a008d5acf2b0 (#7930)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-08 22:48:46 +01:00
Ettore Di Giacinto
f5dee90962 chore(uv): add --index-strategy=unsafe-first-match to l4t (#7934)
This is because the main index might not contain all the dependencies
for torch

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-08 22:48:03 +01:00
8 changed files with 118 additions and 25 deletions

View File

@@ -106,7 +106,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -94,7 +94,11 @@ RUN <<EOT bash
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
- curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
+ if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
+ else
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
+ fi
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
@@ -106,7 +110,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -148,11 +148,14 @@ RUN <<EOT bash
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
- echo https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
- curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
+ if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
+ else
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
+ fi
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
@@ -164,7 +167,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -108,7 +108,11 @@ RUN <<EOT bash
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
- curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
+ if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
+ else
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
+ fi
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
@@ -120,7 +124,7 @@ RUN <<EOT bash
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi

View File

@@ -1,5 +1,5 @@
- LLAMA_VERSION?=ae9f8df77882716b1702df2bed8919499e64cc28
+ LLAMA_VERSION?=593da7fa49503b68f9f01700be9f508f1e528992
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -23,6 +23,7 @@
#include <grpcpp/health_check_service_interface.h>
#include <regex>
#include <atomic>
+ #include <mutex>
#include <signal.h>
#include <thread>
@@ -390,8 +391,9 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
// Initialize fit_params options (can be overridden by options)
// fit_params: whether to auto-adjust params to fit device memory (default: true as in llama.cpp)
params.fit_params = true;
- // fit_params_target: target margin per device in bytes (default: 1GB)
- params.fit_params_target = 1024 * 1024 * 1024;
+ // fit_params_target: target margin per device in bytes (default: 1GB per device)
+ // Initialize as vector with default value for all devices
+ params.fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024 * 1024);
// fit_params_min_ctx: minimum context size for fit (default: 4096)
params.fit_params_min_ctx = 4096;
@@ -468,10 +470,28 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
} else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
if (optval != NULL) {
try {
- // Value is in MiB, convert to bytes
- params.fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
+ // Value is in MiB, can be comma-separated list for multiple devices
+ // Single value is broadcast across all devices
+ std::string arg_next = optval_str;
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+ std::vector<std::string> split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ // Too many values provided
+ continue;
+ }
+ if (split_arg.size() == 1) {
+ // Single value: broadcast to all devices
+ size_t value_mib = std::stoul(split_arg[0]);
+ std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), value_mib * 1024 * 1024);
+ } else {
+ // Multiple values: set per device
+ for (size_t i = 0; i < split_arg.size() && i < params.fit_params_target.size(); i++) {
+ params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024 * 1024;
+ }
+ }
} catch (const std::exception& e) {
- // If conversion fails, keep default value (1GB)
+ // If conversion fails, keep default value (1GB per device)
}
}
} else if (!strcmp(optname, "fit_params_min_ctx") || !strcmp(optname, "fit_ctx")) {
@@ -686,13 +706,13 @@ private:
public:
BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {}
- grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) {
+ grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) override {
// Implement Health RPC
reply->set_message("OK");
return Status::OK;
}
- grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) {
+ grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) override {
// Implement LoadModel RPC
common_params params;
params_parse(ctx_server, request, params);
@@ -709,11 +729,72 @@ public:
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
+ // Capture error messages during model loading
+ struct error_capture {
+ std::string captured_error;
+ std::mutex error_mutex;
+ ggml_log_callback original_callback;
+ void* original_user_data;
+ } error_capture_data;
+ // Get original log callback
+ llama_log_get(&error_capture_data.original_callback, &error_capture_data.original_user_data);
+ // Set custom callback to capture errors
+ llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+ auto* capture = static_cast<error_capture*>(user_data);
+ // Capture error messages
+ if (level == GGML_LOG_LEVEL_ERROR) {
+ std::lock_guard<std::mutex> lock(capture->error_mutex);
+ // Append error message, removing trailing newlines
+ std::string msg(text);
+ while (!msg.empty() && (msg.back() == '\n' || msg.back() == '\r')) {
+ msg.pop_back();
+ }
+ if (!msg.empty()) {
+ if (!capture->captured_error.empty()) {
+ capture->captured_error.append("; ");
+ }
+ capture->captured_error.append(msg);
+ }
+ }
+ // Also call original callback to preserve logging
+ if (capture->original_callback) {
+ capture->original_callback(level, text, capture->original_user_data);
+ }
+ }, &error_capture_data);
// load the model
- if (!ctx_server.load_model(params)) {
- result->set_message("Failed loading model");
+ bool load_success = ctx_server.load_model(params);
+ // Restore original log callback
+ llama_log_set(error_capture_data.original_callback, error_capture_data.original_user_data);
+ if (!load_success) {
+ std::string error_msg = "Failed to load model: " + params.model.path;
+ if (!params.mmproj.path.empty()) {
+ error_msg += " (with mmproj: " + params.mmproj.path + ")";
+ }
+ if (params.has_speculative() && !params.speculative.model.path.empty()) {
+ error_msg += " (with draft model: " + params.speculative.model.path + ")";
+ }
+ // Add captured error details if available
+ {
+ std::lock_guard<std::mutex> lock(error_capture_data.error_mutex);
+ if (!error_capture_data.captured_error.empty()) {
+ error_msg += ". Error: " + error_capture_data.captured_error;
+ } else {
+ error_msg += ". Model file may not exist or be invalid.";
+ }
+ }
+ result->set_message(error_msg);
result->set_success(false);
- return Status::CANCELLED;
+ return grpc::Status(grpc::StatusCode::INTERNAL, error_msg);
}
// Process grammar triggers now that vocab is available
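The LoadModel change above wraps ctx_server.load_model() in a temporary log callback: error-level messages emitted while the model loads are accumulated under a mutex, the original callback is restored afterwards, and the accumulated text is appended to the gRPC error, which is now returned as grpc::StatusCode::INTERNAL with details instead of a bare Status::CANCELLED. The sketch below isolates that capture-and-restore pattern; the logger here is a hypothetical stand-in, not the real llama_log_get/llama_log_set/ggml_log_callback API.

#include <iostream>
#include <mutex>
#include <string>

// Minimal stand-in for a C-style logging hook.
enum class LogLevel { Info, Error };
using LogCallback = void (*)(LogLevel, const char*, void*);

static LogCallback g_cb   = nullptr;
static void*       g_data = nullptr;

void log_set(LogCallback cb, void* data) { g_cb = cb; g_data = data; }
void log_get(LogCallback* cb, void** data) { *cb = g_cb; *data = g_data; }
void log_emit(LogLevel lvl, const char* msg) { if (g_cb) g_cb(lvl, msg, g_data); }

struct ErrorCapture {
    std::string captured;
    std::mutex  mutex;
    LogCallback original_cb   = nullptr;
    void*       original_data = nullptr;
};

int main() {
    ErrorCapture cap;
    log_get(&cap.original_cb, &cap.original_data);

    // Capture-free lambda: it decays to a plain function pointer, and all
    // state travels through the user_data argument, as a C API requires.
    log_set([](LogLevel lvl, const char* text, void* user_data) {
        auto* c = static_cast<ErrorCapture*>(user_data);
        if (lvl == LogLevel::Error) {
            std::lock_guard<std::mutex> lock(c->mutex);
            if (!c->captured.empty()) c->captured.append("; ");
            c->captured.append(text);
        }
        if (c->original_cb) c->original_cb(lvl, text, c->original_data); // keep normal logging
    }, &cap);

    log_emit(LogLevel::Error, "failed to open model file");  // simulated loader error
    log_set(cap.original_cb, cap.original_data);              // restore the previous callback

    std::cout << "captured: " << cap.captured << "\n";
}

The mutex mirrors the one added in the hunk: the loader may log from other threads, so appends to the shared string are serialized.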
@@ -1492,7 +1573,7 @@ public:
return grpc::Status::OK;
}
- grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
+ grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) override {
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
@@ -2163,7 +2244,7 @@ public:
return grpc::Status::OK;
}
- grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
+ grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) override {
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
@@ -2258,7 +2339,7 @@ public:
return grpc::Status::OK;
}
- grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
+ grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) override {
if (!params_base.embedding || params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
return grpc::Status(grpc::StatusCode::UNIMPLEMENTED, "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
}
@@ -2344,7 +2425,7 @@ public:
return grpc::Status::OK;
}
- grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
+ grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
@@ -2367,7 +2448,7 @@ public:
return grpc::Status::OK;
}
- grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) {
+ grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) override {
// request slots data using task queue
auto rd = ctx_server.get_response_reader();
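The remaining hunks in this file only add the override specifier to the RPC handlers. That is a compile-time guard: if a handler's signature ever drifts from the virtual method declared in the generated gRPC service base class, the build fails instead of silently introducing an unrelated overload and leaving the base class's default behaviour (typically an UNIMPLEMENTED status) in place. A small illustrative example, not taken from this codebase:

#include <string>

// Stands in for the generated gRPC service base class.
struct Service {
    virtual ~Service() = default;
    virtual int Health(const std::string& request) { return -1; } // default: "unimplemented"
};

struct Backend : Service {
    // With override, any signature mismatch against Service::Health is a hard error.
    int Health(const std::string& request) override { return 0; }

    // Without override, a mismatched signature like this would compile as a
    // brand-new function that is never reached through a Service* pointer:
    // int Health(std::string request) { return 0; }
};

int main() {
    Backend b;
    Service* s = &b;
    return s->Health("ping"); // dispatches to Backend::Health and returns 0
}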

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
- STABLEDIFFUSION_GGML_VERSION?=9be0b91927dfa4007d053df72dea7302990226bb
+ STABLEDIFFUSION_GGML_VERSION?=0e52afc6513cc2dea9a1a017afc4a008d5acf2b0
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -6111,6 +6111,7 @@
tags:
- embeddings
overrides:
+ backend: llama-cpp
embeddings: true
parameters:
model: granite-embedding-107m-multilingual-f16.gguf