# Sweep the transitive shared-library dependencies of everything already
# bundled in a lib dir. The per-vendor packagers copy an explicit allowlist of
# top-level runtime libs, but those libs pull in transitive deps that aren't in
# the list (e.g. ROCm's librocprofiler-register.so.0, libnuma, libdrm_amdgpu).
# Because backends run through the bundled lib/ld.so with LD_LIBRARY_PATH=lib
# (see run.sh), an unbundled transitive dep is a hard load failure
# (issue #10537: "librocprofiler-register.so.0: cannot open shared object
# file"). ldd resolves the full recursive closure, so a single pass over the
# already-bundled libs is enough; core libc-family deps are skipped via
# copy_elf_deps/is_core_lib so we never shadow the loader's own libc/libstdc++.
#
# Globals:
#   TARGET_LIB_DIR (read) - default lib dir when no argument is given
# Arguments:
#   $1 - lib dir to sweep (optional; defaults to $TARGET_LIB_DIR)
# Returns:
#   0 when ldd is unavailable, when the dir is unset/missing, or when there is
#   nothing to sweep; otherwise the status of the last loop command.
sweep_transitive_deps() {
    # Inner ":-" keeps `set -u` callers from aborting when TARGET_LIB_DIR is
    # unset and no argument was passed.
    local dir="${1:-${TARGET_LIB_DIR:-}}"
    command -v ldd >/dev/null 2>&1 || return 0

    # Nothing to sweep without a real directory. This also stops an empty
    # value from turning the glob below into "/*.so*".
    if [ ! -d "$dir" ]; then
        echo "sweep_transitive_deps: '$dir' is not a directory, skipping" >&2
        return 0
    fi

    # The word list of a `for` loop is expanded exactly once, before the first
    # iteration, so it is already a snapshot of the libs present when the sweep
    # started: files that copy_elf_deps adds while we run are not revisited
    # (ldd already returned their closure). Iterating the glob directly also
    # avoids toggling nullglob and avoids expanding an empty array, which is
    # fatal under `set -u` on bash < 4.4.
    local lib
    for lib in "$dir"/*.so*; do
        # An unmatched glob stays literal and dangling symlinks do not exist.
        [ -e "$lib" ] || continue
        # Skip symlinks: their real target is expected to be in the same dir
        # and is swept on its own.
        # NOTE(review): a symlink whose target lives outside "$dir" would be
        # skipped without its target being swept - confirm copy_lib never
        # produces such links.
        [ -L "$lib" ] && continue
        copy_elf_deps "$lib"
    done
}
#!/bin/bash
# Regression test for scripts/build/package-gpu-libs.sh.
#
# Guards issue #10537: the per-vendor packagers copy an explicit allowlist of
# top-level GPU runtime libs but used to miss their transitive dependencies
# (e.g. ROCm's librocprofiler-register.so.0). Since backends run through the
# bundled lib/ld.so with LD_LIBRARY_PATH=lib, an unbundled transitive dep is a
# fatal "cannot open shared object file" at load time.
#
# This test fabricates a primary lib that links a transitive lib, simulates the
# allowlist step (primary copied, transitive not), and asserts the transitive
# sweep pulls the dependency in. Requires gcc + ldd (present in build images).
#
# Exit status: 0 on PASS or SKIP (toolchain missing), 1 on FAIL.
set -euo pipefail

CURDIR=$(dirname "$(realpath "$0")")
SCRIPT="$CURDIR/package-gpu-libs.sh"

# Without a compiler or ldd the fixture cannot be built or inspected; skip
# rather than fail so the test is harmless outside the build images.
if ! command -v gcc >/dev/null 2>&1 || ! command -v ldd >/dev/null 2>&1; then
    echo "SKIP: gcc/ldd not available"
    exit 0
fi

WORK=$(mktemp -d)
trap 'rm -rf -- "$WORK"' EXIT

# Transitive dependency (stand-in for librocprofiler-register.so.0).
echo 'int transitive_fn(void){return 42;}' > "$WORK/transitive.c"
gcc -shared -fPIC -o "$WORK/libfaketransitive.so.0" "$WORK/transitive.c"

# Primary allowlisted lib (stand-in for libhipblas.so) that links it.
echo 'int transitive_fn(void); int primary_fn(void){return transitive_fn();}' > "$WORK/primary.c"
gcc -shared -fPIC -o "$WORK/libfakeprimary.so.0" "$WORK/primary.c" \
    -L"$WORK" -l:libfaketransitive.so.0 -Wl,-rpath,"$WORK"

# Simulate the allowlist step: primary already bundled, transitive not.
TARGET="$WORK/target"
mkdir -p "$TARGET"
cp "$WORK/libfakeprimary.so.0" "$TARGET/"

# Make the transitive dep resolvable like /opt/rocm libs are in the build image.
# Append the previous value only when it is non-empty: a trailing ":" would add
# an empty search-path entry, which the dynamic loader treats as the current
# working directory.
export LD_LIBRARY_PATH="$WORK${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Sourcing (rather than executing) brings sweep_transitive_deps into this shell.
# shellcheck source=/dev/null
source "$SCRIPT" "$TARGET"
sweep_transitive_deps "$TARGET"

if [ -e "$TARGET/libfaketransitive.so.0" ]; then
    echo "PASS: transitive dependency was bundled by sweep_transitive_deps"
    exit 0
fi

echo "FAIL: transitive dependency was NOT bundled (regression of #10537)"
ls -la "$TARGET"
exit 1