fix(gpu-libs): bundle transitive deps of GPU runtime libs (#10537) (#10539)

fix(gpu-libs): bundle transitive deps of GPU runtime libs

The per-vendor packagers in package-gpu-libs.sh copy an explicit allowlist
of top-level GPU runtime libraries (libamdhip64, libhipblas, librocblas, the
CUDA/Intel equivalents, ...) but never resolve the transitive dependencies
of those libraries. Backends run through the bundled lib/ld.so with
LD_LIBRARY_PATH=lib, so any transitive dep that is not bundled is a fatal
"cannot open shared object file" at load time.

On recent ROCm (base image rocm 7.2.1) the runtime libs link against
librocprofiler-register.so.0, which is not in the allowlist, so the rocm
llama-cpp backend (and every other GPU backend sharing this script) failed
to load with:

  librocprofiler-register.so.0: cannot open shared object file

The Vulkan path already solved this class of problem with copy_elf_deps
(ldd-based transitive resolution), but that sweep was only wired into the
Vulkan ICD path. This adds a generic sweep_transitive_deps that runs the
same ldd resolution over everything the allowlist already bundled, and wires
it into the ROCm, CUDA and Intel packagers. ldd returns the full recursive
closure, so one pass suffices; core libc-family deps are skipped via
is_core_lib so we never shadow the loader's own libc/libstdc++.

Adds a self-contained regression test (gcc + ldd) that fabricates a primary
lib linking a transitive lib and asserts the sweep bundles the dependency.

Fixes #10537

Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
LocalAI [bot]
2026-06-27 01:36:33 +02:00
committed by GitHub
parent 2c96c2d08e
commit f98b0f1c1e
2 changed files with 99 additions and 0 deletions

View File

@@ -141,6 +141,38 @@ copy_elf_deps() {
done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}')
}
# Bundle the transitive shared-library dependencies of every library that is
# already present in a lib dir.
#
# Why: the per-vendor packagers below only copy an explicit allowlist of
# top-level runtime libs, yet those libs drag in further deps that are not on
# the list (e.g. ROCm's librocprofiler-register.so.0, libnuma, libdrm_amdgpu).
# Backends are launched through the bundled lib/ld.so with LD_LIBRARY_PATH=lib
# (see run.sh), so a dep that was never bundled is a hard load failure
# (issue #10537: "librocprofiler-register.so.0: cannot open shared object
# file").
#
# How: ldd reports the full recursive closure, so one pass over the libs that
# are already bundled is sufficient. Core libc-family deps are left out by
# copy_elf_deps/is_core_lib so the loader's own libc/libstdc++ is never
# shadowed.
#
# Arguments: $1 - lib dir to sweep (defaults to $TARGET_LIB_DIR)
# Returns:   0 right away when ldd is not installed (the sweep is a no-op)
sweep_transitive_deps() {
    local dir="${1:-$TARGET_LIB_DIR}"

    # Without ldd there is nothing to resolve deps with; silently do nothing.
    if ! command -v ldd >/dev/null 2>&1; then
        return 0
    fi

    # Freeze the candidate list up front: copy_elf_deps drops new files into
    # the dir while it runs, and since ldd already yields the full recursive
    # closure only the libs present before the sweep need to be visited.
    # nullglob makes an empty dir expand to an empty list; the caller's
    # setting is put back right after the glob.
    local nullglob_was_off=0
    shopt -q nullglob || nullglob_was_off=1
    shopt -s nullglob
    local -a snapshot=("$dir"/*.so*)
    if [ "$nullglob_was_off" -eq 1 ]; then
        shopt -u nullglob
    fi

    local lib
    for lib in "${snapshot[@]}"; do
        # Visit only entries that exist and are not symlinks: a symlink's
        # real target is itself part of the snapshot and gets swept there.
        if [ -e "$lib" ] && [ ! -L "$lib" ]; then
            copy_elf_deps "$lib"
        fi
    done
}
# Package NVIDIA CUDA libraries
package_cuda_libs() {
echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -185,6 +217,10 @@ package_cuda_libs() {
# cp -arfL /usr/local/cuda/targets "$TARGET_LIB_DIR/../cuda/" 2>/dev/null || true
# fi
# Pull in transitive deps the allowlist misses so the backend is
# self-contained (same class of failure as #10537).
sweep_transitive_deps "$TARGET_LIB_DIR"
echo "CUDA libraries packaged successfully"
}
@@ -261,6 +297,10 @@ package_rocm_libs() {
fi
done
# Pull in transitive deps the allowlist misses (librocprofiler-register.so.0,
# libnuma, libdrm_amdgpu, ...) so the backend is self-contained. See #10537.
sweep_transitive_deps "$TARGET_LIB_DIR"
echo "ROCm libraries packaged successfully"
}
@@ -303,6 +343,10 @@ package_intel_libs() {
fi
done
# Pull in transitive deps the allowlist misses so the backend is
# self-contained (same class of failure as #10537).
sweep_transitive_deps "$TARGET_LIB_DIR"
echo "Intel oneAPI libraries packaged successfully"
}
@@ -432,6 +476,7 @@ export -f copy_lib
export -f copy_libs_glob
export -f is_core_lib
export -f copy_elf_deps
export -f sweep_transitive_deps
export -f package_cuda_libs
export -f package_rocm_libs
export -f package_intel_libs

View File

@@ -0,0 +1,54 @@
#!/bin/bash
# Regression test for scripts/build/package-gpu-libs.sh.
#
# Guards issue #10537: the per-vendor packagers copy an explicit allowlist of
# top-level GPU runtime libs but used to miss their transitive dependencies
# (e.g. ROCm's librocprofiler-register.so.0). Since backends run through the
# bundled lib/ld.so with LD_LIBRARY_PATH=lib, an unbundled transitive dep is a
# fatal "cannot open shared object file" at load time.
#
# This test fabricates a primary lib that links a transitive lib, simulates the
# allowlist step (primary copied, transitive not), and asserts the transitive
# sweep pulls the dependency in. Requires gcc + ldd (present in build images).
#
# Exit status: 0 on PASS or SKIP (toolchain missing), 1 on FAIL.
set -euo pipefail

CURDIR=$(dirname "$(realpath "$0")")
SCRIPT="$CURDIR/package-gpu-libs.sh"

if ! command -v gcc >/dev/null 2>&1 || ! command -v ldd >/dev/null 2>&1; then
    echo "SKIP: gcc/ldd not available"
    exit 0
fi

# Fail with a clear message instead of an opaque `source` error.
if [ ! -f "$SCRIPT" ]; then
    echo "FAIL: script under test not found: $SCRIPT" >&2
    exit 1
fi

WORK=$(mktemp -d)
trap 'rm -rf -- "$WORK"' EXIT

# Transitive dependency (stand-in for librocprofiler-register.so.0).
echo 'int transitive_fn(void){return 42;}' > "$WORK/transitive.c"
gcc -shared -fPIC -o "$WORK/libfaketransitive.so.0" "$WORK/transitive.c"

# Primary allowlisted lib (stand-in for libhipblas.so) that links it.
echo 'int transitive_fn(void); int primary_fn(void){return transitive_fn();}' > "$WORK/primary.c"
gcc -shared -fPIC -o "$WORK/libfakeprimary.so.0" "$WORK/primary.c" \
    -L"$WORK" -l:libfaketransitive.so.0 -Wl,-rpath,"$WORK"

# Simulate the allowlist step: primary already bundled, transitive not.
TARGET="$WORK/target"
mkdir -p "$TARGET"
cp "$WORK/libfakeprimary.so.0" "$TARGET/"

# Make the transitive dep resolvable like /opt/rocm libs are in the build image.
# Append the inherited value only when it is non-empty: a trailing ":" would
# leave an empty entry, which the dynamic loader treats as the current
# working directory.
export LD_LIBRARY_PATH="$WORK${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# shellcheck source=/dev/null
source "$SCRIPT" "$TARGET"
sweep_transitive_deps "$TARGET"

if [ -e "$TARGET/libfaketransitive.so.0" ]; then
    echo "PASS: transitive dependency was bundled by sweep_transitive_deps"
    exit 0
fi

# Diagnostics go to stderr so they are not mistaken for normal output.
echo "FAIL: transitive dependency was NOT bundled (regression of #10537)" >&2
ls -la "$TARGET" >&2
exit 1