mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-27 09:57:14 -04:00
fix(gpu-libs): bundle transitive deps of GPU runtime libs

The per-vendor packagers in package-gpu-libs.sh copy an explicit allowlist of top-level GPU runtime libraries (libamdhip64, libhipblas, librocblas, the CUDA/Intel equivalents, ...) but never resolved their transitive dependencies. Backends run through the bundled lib/ld.so with LD_LIBRARY_PATH=lib, so any transitive dep not in the allowlist is a fatal "cannot open shared object file" at load time.

On recent ROCm (base image rocm 7.2.1) the runtime libs link against librocprofiler-register.so.0, which is not in the allowlist, so the rocm llama-cpp backend (and every other GPU backend sharing this script) failed to load with:

    librocprofiler-register.so.0: cannot open shared object file

The Vulkan path already solved this class of problem with copy_elf_deps (ldd-based transitive resolution), but that sweep was only wired into the Vulkan ICD path. This adds a generic sweep_transitive_deps that runs the same ldd resolution over everything the allowlist already bundled, and wires it into the ROCm, CUDA and Intel packagers. ldd returns the full recursive closure, so one pass suffices; core libc-family deps are skipped via is_core_lib so we never shadow the loader's own libc/libstdc++.

Adds a self-contained regression test (gcc + ldd) that fabricates a primary lib linking a transitive lib and asserts the sweep bundles the dependency.

Fixes #10537

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -141,6 +141,38 @@ copy_elf_deps() {
|
||||
done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}')
|
||||
}
|
||||
|
||||
# Sweep the transitive shared-library dependencies of everything already
# bundled in a lib dir.
#
# The per-vendor packagers below copy an explicit allowlist of top-level
# runtime libs, but those libs pull in transitive deps that aren't in the list
# (e.g. ROCm's librocprofiler-register.so.0, libnuma, libdrm_amdgpu). Because
# backends run through the bundled lib/ld.so with LD_LIBRARY_PATH=lib (see
# run.sh), an unbundled transitive dep is a hard load failure (issue #10537:
# "librocprofiler-register.so.0: cannot open shared object file").
#
# ldd resolves the full recursive closure, so a single pass over the
# already-bundled libs is enough; core libc-family deps are skipped via
# copy_elf_deps/is_core_lib so we never shadow the loader's own libc/libstdc++.
#
# Arguments: $1 - lib dir to sweep (defaults to $TARGET_LIB_DIR)
# Returns:   0 without doing anything when ldd is unavailable or the dir
#            holds no *.so* entries.
sweep_transitive_deps() {
    local dir="${1:-$TARGET_LIB_DIR}"
    command -v ldd >/dev/null 2>&1 || return 0

    # Snapshot the current set first: copy_elf_deps adds files as it runs, and
    # ldd already returns the full recursive closure, so we only need to sweep
    # the libs that were present before the sweep started.
    #
    # nullglob is enabled only for the duration of the glob and only if the
    # caller did not already have it on. Testing the state inside `if` is safe
    # under set -e and avoids round-tripping the setting through eval.
    local restore_nullglob=0
    if ! shopt -q nullglob; then
        restore_nullglob=1
        shopt -s nullglob
    fi
    local libs=("$dir"/*.so*)
    if [ "$restore_nullglob" -eq 1 ]; then
        shopt -u nullglob
    fi

    # Expanding an empty array under `set -u` is an unbound-variable error on
    # bash < 4.4, so bail out explicitly when nothing matched.
    [ "${#libs[@]}" -gt 0 ] || return 0

    local lib
    for lib in "${libs[@]}"; do
        [ -e "$lib" ] || continue
        # Skip symlinks: their real target is in the snapshot and gets swept.
        [ -L "$lib" ] && continue
        copy_elf_deps "$lib"
    done
}
|
||||
|
||||
# Package NVIDIA CUDA libraries
|
||||
package_cuda_libs() {
|
||||
echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..."
|
||||
@@ -185,6 +217,10 @@ package_cuda_libs() {
|
||||
# cp -arfL /usr/local/cuda/targets "$TARGET_LIB_DIR/../cuda/" 2>/dev/null || true
|
||||
# fi
|
||||
|
||||
# Pull in transitive deps the allowlist misses so the backend is
|
||||
# self-contained (same class of failure as #10537).
|
||||
sweep_transitive_deps "$TARGET_LIB_DIR"
|
||||
|
||||
echo "CUDA libraries packaged successfully"
|
||||
}
|
||||
|
||||
@@ -261,6 +297,10 @@ package_rocm_libs() {
|
||||
fi
|
||||
done
|
||||
|
||||
# Pull in transitive deps the allowlist misses (librocprofiler-register.so.0,
|
||||
# libnuma, libdrm_amdgpu, ...) so the backend is self-contained. See #10537.
|
||||
sweep_transitive_deps "$TARGET_LIB_DIR"
|
||||
|
||||
echo "ROCm libraries packaged successfully"
|
||||
}
|
||||
|
||||
@@ -303,6 +343,10 @@ package_intel_libs() {
|
||||
fi
|
||||
done
|
||||
|
||||
# Pull in transitive deps the allowlist misses so the backend is
|
||||
# self-contained (same class of failure as #10537).
|
||||
sweep_transitive_deps "$TARGET_LIB_DIR"
|
||||
|
||||
echo "Intel oneAPI libraries packaged successfully"
|
||||
}
|
||||
|
||||
@@ -432,6 +476,7 @@ export -f copy_lib
|
||||
export -f copy_libs_glob
|
||||
export -f is_core_lib
|
||||
export -f copy_elf_deps
|
||||
export -f sweep_transitive_deps
|
||||
export -f package_cuda_libs
|
||||
export -f package_rocm_libs
|
||||
export -f package_intel_libs
|
||||
|
||||
54
scripts/build/package-gpu-libs_test.sh
Executable file
54
scripts/build/package-gpu-libs_test.sh
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/bin/bash
# Regression test for scripts/build/package-gpu-libs.sh (issue #10537).
#
# The per-vendor packagers copy an explicit allowlist of top-level GPU runtime
# libs and used to miss the libs those depend on (e.g. ROCm's
# librocprofiler-register.so.0). Backends run through the bundled lib/ld.so
# with LD_LIBRARY_PATH=lib, so a dependency that was not bundled is a fatal
# "cannot open shared object file" at load time.
#
# Flow: build a primary lib that links a transitive lib, copy only the primary
# into the target dir (what the allowlist step does), run the sweep, and check
# that the transitive lib was pulled in. Requires gcc + ldd (present in build
# images); the test is skipped when either is missing.
set -euo pipefail

CURDIR=$(dirname "$(realpath "$0")")
SCRIPT="$CURDIR/package-gpu-libs.sh"

# Without a compiler and ldd there is nothing to exercise: skip, don't fail.
if ! { command -v gcc >/dev/null 2>&1 && command -v ldd >/dev/null 2>&1; }; then
    echo "SKIP: gcc/ldd not available"
    exit 0
fi

# Scratch area, removed on every exit path.
WORK=$(mktemp -d)
trap 'rm -rf "$WORK"' EXIT

# Stand-in for librocprofiler-register.so.0: the transitive dependency.
printf '%s\n' 'int transitive_fn(void){return 42;}' > "$WORK/transitive.c"
gcc -shared -fPIC -o "$WORK/libfaketransitive.so.0" "$WORK/transitive.c"

# Stand-in for libhipblas.so: the allowlisted primary lib linking the above.
printf '%s\n' \
    'int transitive_fn(void); int primary_fn(void){return transitive_fn();}' \
    > "$WORK/primary.c"
gcc -shared -fPIC -o "$WORK/libfakeprimary.so.0" "$WORK/primary.c" -L"$WORK" -l:libfaketransitive.so.0 -Wl,-rpath,"$WORK"

# Allowlist step: only the primary lib lands in the target dir.
TARGET="$WORK/target"
mkdir -p "$TARGET"
cp "$WORK/libfakeprimary.so.0" "$TARGET/"

# Let ldd resolve the transitive dep, like /opt/rocm libs in the build image.
LD_LIBRARY_PATH="$WORK:${LD_LIBRARY_PATH:-}"
export LD_LIBRARY_PATH

# shellcheck source=/dev/null
source "$SCRIPT" "$TARGET"
sweep_transitive_deps "$TARGET"

if [ ! -e "$TARGET/libfaketransitive.so.0" ]; then
    echo "FAIL: transitive dependency was NOT bundled (regression of #10537)"
    ls -la "$TARGET"
    exit 1
fi

echo "PASS: transitive dependency was bundled by sweep_transitive_deps"
exit 0
|
||||
Reference in New Issue
Block a user