chore(uv): add --index-strategy=unsafe-first-match to l4t

This is because the main index might not contain all the dependencies for torch.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-23 08:10:48 -04:00 · 2026-01-08 21:47:18 +00:00
13 changed files with 61 additions and 118 deletions
--- a/2
+++ b/2
@@ -106,7 +106,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -94,11 +94,7 @@ RUN <<EOT bash
            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
        fi
        dpkg -i cuda-keyring_1.1-1_all.deb && \
        rm -f cuda-keyring_1.1-1_all.deb && \
@@ -110,7 +106,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -148,14 +148,11 @@ RUN <<EOT bash
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils
        if [ "amd64" = "$TARGETARCH" ]; then
+            echo https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
        fi
        dpkg -i cuda-keyring_1.1-1_all.deb && \
        rm -f cuda-keyring_1.1-1_all.deb && \
@@ -167,7 +164,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -108,11 +108,7 @@ RUN <<EOT bash
            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
        fi
        dpkg -i cuda-keyring_1.1-1_all.deb && \
        rm -f cuda-keyring_1.1-1_all.deb && \
@@ -124,7 +120,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=593da7fa49503b68f9f01700be9f508f1e528992
+LLAMA_VERSION?=ae9f8df77882716b1702df2bed8919499e64cc28
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -23,7 +23,6 @@
 #include <grpcpp/health_check_service_interface.h>
 #include <regex>
 #include <atomic>
-#include <mutex>
 #include <signal.h>
 #include <thread>

@@ -391,9 +390,8 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    // Initialize fit_params options (can be overridden by options)
    // fit_params: whether to auto-adjust params to fit device memory (default: true as in llama.cpp)
    params.fit_params = true;
-    // fit_params_target: target margin per device in bytes (default: 1GB per device)
-    // Initialize as vector with default value for all devices
-    params.fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024 * 1024);
+    // fit_params_target: target margin per device in bytes (default: 1GB)
+    params.fit_params_target = 1024 * 1024 * 1024;
    // fit_params_min_ctx: minimum context size for fit (default: 4096)
    params.fit_params_min_ctx = 4096;

@@ -470,28 +468,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
        } else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
            if (optval != NULL) {
                try {
-                    // Value is in MiB, can be comma-separated list for multiple devices
-                    // Single value is broadcast across all devices
-                    std::string arg_next = optval_str;
-                    const std::regex regex{ R"([,/]+)" };
-                    std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
-                    std::vector<std::string> split_arg{ it, {} };
-                    if (split_arg.size() >= llama_max_devices()) {
-                        // Too many values provided
-                        continue;
-                    }
-                    if (split_arg.size() == 1) {
-                        // Single value: broadcast to all devices
-                        size_t value_mib = std::stoul(split_arg[0]);
-                        std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), value_mib * 1024 * 1024);
-                    } else {
-                        // Multiple values: set per device
-                        for (size_t i = 0; i < split_arg.size() && i < params.fit_params_target.size(); i++) {
-                            params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024 * 1024;
-                        }
-                    }
+                    // Value is in MiB, convert to bytes
+                    params.fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
                } catch (const std::exception& e) {
-                    // If conversion fails, keep default value (1GB per device)
+                    // If conversion fails, keep default value (1GB)
                }
            }
        } else if (!strcmp(optname, "fit_params_min_ctx") || !strcmp(optname, "fit_ctx")) {
@@ -706,13 +686,13 @@ private:
 public:
    BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {}

-    grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) override {
+    grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) {
        // Implement Health RPC
        reply->set_message("OK");
        return Status::OK;
    }

-    grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) override {
+    grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) {
        // Implement LoadModel RPC
        common_params params;
        params_parse(ctx_server, request, params);
@@ -729,72 +709,11 @@ public:
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
        LOG_INF("\n");
-        
-        // Capture error messages during model loading
-        struct error_capture {
-            std::string captured_error;
-            std::mutex error_mutex;
-            ggml_log_callback original_callback;
-            void* original_user_data;
-        } error_capture_data;
-        
-        // Get original log callback
-        llama_log_get(&error_capture_data.original_callback, &error_capture_data.original_user_data);
-        
-        // Set custom callback to capture errors
-        llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
-            auto* capture = static_cast<error_capture*>(user_data);
-            
-            // Capture error messages
-            if (level == GGML_LOG_LEVEL_ERROR) {
-                std::lock_guard<std::mutex> lock(capture->error_mutex);
-                // Append error message, removing trailing newlines
-                std::string msg(text);
-                while (!msg.empty() && (msg.back() == '\n' || msg.back() == '\r')) {
-                    msg.pop_back();
-                }
-                if (!msg.empty()) {
-                    if (!capture->captured_error.empty()) {
-                        capture->captured_error.append("; ");
-                    }
-                    capture->captured_error.append(msg);
-                }
-            }
-            
-            // Also call original callback to preserve logging
-            if (capture->original_callback) {
-                capture->original_callback(level, text, capture->original_user_data);
-            }
-        }, &error_capture_data);
-        
        // load the model
-        bool load_success = ctx_server.load_model(params);
-        
-        // Restore original log callback
-        llama_log_set(error_capture_data.original_callback, error_capture_data.original_user_data);
-        
-        if (!load_success) {
-            std::string error_msg = "Failed to load model: " + params.model.path;
-            if (!params.mmproj.path.empty()) {
-                error_msg += " (with mmproj: " + params.mmproj.path + ")";
-            }
-            if (params.has_speculative() && !params.speculative.model.path.empty()) {
-                error_msg += " (with draft model: " + params.speculative.model.path + ")";
-            }
-            
-            // Add captured error details if available
-            {
-                std::lock_guard<std::mutex> lock(error_capture_data.error_mutex);
-                if (!error_capture_data.captured_error.empty()) {
-                    error_msg += ". Error: " + error_capture_data.captured_error;
-                } else {
-                    error_msg += ". Model file may not exist or be invalid.";
-                }
-            }
-            
-            result->set_message(error_msg);
+        if (!ctx_server.load_model(params)) {
+            result->set_message("Failed loading model");
            result->set_success(false);
-            return grpc::Status(grpc::StatusCode::INTERNAL, error_msg);
+            return Status::CANCELLED;
        }

        // Process grammar triggers now that vocab is available
@@ -1573,7 +1492,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) override {
+    grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
         if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
@@ -2244,7 +2163,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) override {
+    grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
        if (params_base.model.path.empty()) {
            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
        }
@@ -2339,7 +2258,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) override {
+    grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
        if (!params_base.embedding || params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
            return grpc::Status(grpc::StatusCode::UNIMPLEMENTED, "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
        }
@@ -2425,7 +2344,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
+    grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
        if (params_base.model.path.empty()) {
            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
        }
@@ -2448,7 +2367,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) override {
+    grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) {

 // request slots data using task queue
        auto rd = ctx_server.get_response_reader();
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=0e52afc6513cc2dea9a1a017afc4a008d5acf2b0
+STABLEDIFFUSION_GGML_VERSION?=9be0b91927dfa4007d053df72dea7302990226bb

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/python/chatterbox/install.sh
+++ b/backend/python/chatterbox/install.sh
@@ -15,6 +15,14 @@ fi
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
+
+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
+
 EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"

 installRequirements
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@@ -23,4 +23,11 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
  PY_STANDALONE_TAG="20251120"
 fi

+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
+
 installRequirements
--- a/backend/python/kokoro/install.sh
+++ b/backend/python/kokoro/install.sh
@@ -16,4 +16,11 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
+
 installRequirements
--- a/backend/python/neutts/install.sh
+++ b/backend/python/neutts/install.sh
@@ -16,6 +16,13 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
+
 if [ "x${BUILD_TYPE}" == "xcublas" ] || [ "x${BUILD_TYPE}" == "xl4t" ]; then
    export CMAKE_ARGS="-DGGML_CUDA=on"
 fi
--- a/backend/python/vibevoice/install.sh
+++ b/backend/python/vibevoice/install.sh
@@ -23,6 +23,13 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
  PY_STANDALONE_TAG="20251120"
 fi

+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
+
 installRequirements

 git clone https://github.com/microsoft/VibeVoice.git
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -6111,7 +6111,6 @@
  tags:
    - embeddings
  overrides:
-    backend: llama-cpp
    embeddings: true
    parameters:
      model: granite-embedding-107m-multilingual-f16.gguf