From 1c604d1b198b7f3503220504eef5cc4ae13c4266 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 3 Jul 2026 20:30:24 +0000 Subject: [PATCH] fix(gpu-libs): bundle hipBLASLt TensileLibrary data so ROCm backends stop falling back (#10660) The ROCm packager copied rocBLAS kernel data (rocblas/library/*.dat) into the bundled lib/ dir and run.sh pointed ROCBLAS_TENSILE_LIBPATH at it, but the parallel hipBLASLt data dir (hipblaslt/library/TensileLibrary_lazy_gfx*.dat) was never packaged and no HIPBLASLT_TENSILE_LIBPATH was set. The bundled libhipblaslt.so therefore resolved its per-arch kernel data relative to itself, found nothing, and silently fell back to slow generic kernels, logging: rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory rocblaslt error: Could not load "TensileLibrary_lazy_gfx1201.dat" Fix, mirroring the existing rocBLAS handling: - package-gpu-libs.sh: extract the rocblas data-dir copy into a reusable copy_rocm_data_dir helper and call it for both rocblas and hipblaslt. - llama-cpp/turboquant run.sh: export HIPBLASLT_TENSILE_LIBPATH when the bundled hipblaslt/library dir exists. The helper takes an optional ROCM_BASE_DIRS override so the copy is unit testable without a real ROCm install; add a regression test that runs package_rocm_libs against a fabricated ROCm tree and asserts both data dirs are bundled. Note: this bundles whatever gfx*.dat the build image's ROCm provides. If a given arch's tensile data is absent from the shipped ROCm, that arch still needs a ROCm bump; the packaging gap itself is fixed for every supported arch. 
Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --- backend/cpp/llama-cpp/run.sh | 6 ++ backend/cpp/turboquant/run.sh | 6 ++ .../build/package-gpu-libs-rocm-data_test.sh | 57 +++++++++++++++ scripts/build/package-gpu-libs.sh | 72 ++++++++++++++----- 4 files changed, 122 insertions(+), 19 deletions(-) create mode 100755 scripts/build/package-gpu-libs-rocm-data_test.sh diff --git a/backend/cpp/llama-cpp/run.sh b/backend/cpp/llama-cpp/run.sh index 09a13b0fe..1ccc1a37b 100755 --- a/backend/cpp/llama-cpp/run.sh +++ b/backend/cpp/llama-cpp/run.sh @@ -36,6 +36,12 @@ else if [ -d "$CURDIR/lib/rocblas/library" ]; then export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library fi + # Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its + # TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at + # the bundled data or it falls back to slow generic kernels (issue #10660). + if [ -d "$CURDIR/lib/hipblaslt/library" ]; then + export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library + fi fi # If there is a lib/ld.so, use it diff --git a/backend/cpp/turboquant/run.sh b/backend/cpp/turboquant/run.sh index 33864385d..84db6985a 100755 --- a/backend/cpp/turboquant/run.sh +++ b/backend/cpp/turboquant/run.sh @@ -34,6 +34,12 @@ else if [ -d "$CURDIR/lib/rocblas/library" ]; then export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library fi + # Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its + # TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at + # the bundled data or it falls back to slow generic kernels (issue #10660). 
+ if [ -d "$CURDIR/lib/hipblaslt/library" ]; then + export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library + fi fi # If there is a lib/ld.so, use it diff --git a/scripts/build/package-gpu-libs-rocm-data_test.sh b/scripts/build/package-gpu-libs-rocm-data_test.sh new file mode 100755 index 000000000..2d4a9aacf --- /dev/null +++ b/scripts/build/package-gpu-libs-rocm-data_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Regression test for scripts/build/package-gpu-libs.sh ROCm data bundling. +# +# Guards issue #10660: hipBLASLt (rocblaslt) resolves its TensileLibrary_lazy_gfx*.dat +# kernel data relative to the bundled libhipblaslt.so. The packager copied the +# rocblas/ data dir but not the hipblaslt/ data dir, so the bundled backend +# fell back to slow generic kernels and logged +# rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory +# +# This test fabricates a fake ROCm tree containing both rocblas/ and hipblaslt/ +# tensile data, points the packager at it via ROCM_BASE_DIRS, and asserts BOTH +# data directories are bundled into the target lib dir. +set -euo pipefail + +CURDIR=$(dirname "$(realpath "$0")") +SCRIPT="$CURDIR/package-gpu-libs.sh" + +WORK=$(mktemp -d) +trap 'rm -rf "$WORK"' EXIT + +# Fabricate a fake ROCm install with both rocblas and hipblaslt tensile data. +FAKE_ROCM="$WORK/opt/rocm" +mkdir -p "$FAKE_ROCM/lib/rocblas/library" +mkdir -p "$FAKE_ROCM/lib/hipblaslt/library" +echo "fake rocblas tensile" > "$FAKE_ROCM/lib/rocblas/library/TensileLibrary_lazy_gfx1201.dat" +echo "fake hipblaslt tensile" > "$FAKE_ROCM/lib/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat" + +TARGET="$WORK/target" +mkdir -p "$TARGET" + +# shellcheck source=/dev/null +source "$SCRIPT" "$TARGET" + +# Point the data-dir copy at the fabricated tree instead of the real /opt/rocm, +# then run the actual ROCm packager. This asserts package_rocm_libs itself +# bundles BOTH data dirs, not just that the helper works in isolation. 
+export BUILD_TYPE=hipblas +export ROCM_BASE_DIRS="$FAKE_ROCM" +package_rocm_libs + +fail=false +if [ ! -e "$TARGET/rocblas/library/TensileLibrary_lazy_gfx1201.dat" ]; then + echo "FAIL: rocblas tensile data was NOT bundled" + fail=true +fi +if [ ! -e "$TARGET/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat" ]; then + echo "FAIL: hipblaslt tensile data was NOT bundled (regression of #10660)" + fail=true +fi + +if [ "$fail" = true ]; then + ls -R "$TARGET" || true + exit 1 +fi + +echo "PASS: rocblas and hipblaslt tensile data were both bundled" +exit 0 diff --git a/scripts/build/package-gpu-libs.sh b/scripts/build/package-gpu-libs.sh index 17c0d0ca8..1e439d23d 100755 --- a/scripts/build/package-gpu-libs.sh +++ b/scripts/build/package-gpu-libs.sh @@ -224,6 +224,50 @@ package_cuda_libs() { echo "CUDA libraries packaged successfully" } +# Copy a ROCm library data subdirectory (e.g. rocblas, hipblaslt) into the +# bundled lib/ dir. These directories hold the TensileLibrary_*.dat GPU kernel +# tuning files, which rocBLAS/hipBLASLt load at runtime *relative to their own +# .so*. Since backends ship their own copies of libhipblaslt.so/librocblas.so +# under lib/, the matching data dir must travel with them or the libs fall back +# to slow generic kernels (rocblaslt error: Cannot read TensileLibrary_lazy_gfx*.dat; +# see issue #10660). +# +# The ROCm search roots default to /opt/rocm{,-*} but can be overridden via the +# ROCM_BASE_DIRS env var (space-separated), which keeps the copy unit-testable +# without a real ROCm install. +# Args: $1 = data subdir name found under each ROCm root's lib{,64}/ dir +copy_rocm_data_dir() { + local data_name="$1" + # Single-line `local x=$(...)` on purpose: `local` masks the command + # substitution's exit status, which is 1 when nullglob is unset and would + # otherwise trip the script's `set -e`.
+ local old_nullglob=$(shopt -p nullglob) + shopt -s nullglob + local rocm_dirs + if [ -n "${ROCM_BASE_DIRS:-}" ]; then + # shellcheck disable=SC2206 # intentional word-split of the override + rocm_dirs=(${ROCM_BASE_DIRS}) + else + rocm_dirs=(/opt/rocm /opt/rocm-*) + fi + eval "$old_nullglob" + local found=false + local rocm_base lib_subdir + for rocm_base in "${rocm_dirs[@]}"; do + for lib_subdir in lib lib64; do + if [ -d "$rocm_base/$lib_subdir/$data_name" ]; then + echo "Found $data_name data at $rocm_base/$lib_subdir/$data_name" + mkdir -p "$TARGET_LIB_DIR/$data_name" + cp -arfL "$rocm_base/$lib_subdir/$data_name/"* "$TARGET_LIB_DIR/$data_name/" || echo "WARNING: Failed to copy $data_name data from $rocm_base/$lib_subdir/$data_name" + found=true + fi + done + done + if [ "$found" = false ]; then + echo "WARNING: No $data_name library data found in ${ROCM_BASE_DIRS:-/opt/rocm*}/lib{,64}/$data_name" + fi +} + # Package AMD ROCm/HIPBlas libraries package_rocm_libs() { echo "Packaging ROCm/HIPBlas libraries for BUILD_TYPE=${BUILD_TYPE}..." @@ -267,27 +311,16 @@ package_rocm_libs() { fi done - # Copy rocblas library data (tuning files, TensileLibrary, etc.) 
- local old_nullglob=$(shopt -p nullglob) - shopt -s nullglob - local rocm_dirs=(/opt/rocm /opt/rocm-*) - eval "$old_nullglob" - local rocblas_found=false - for rocm_base in "${rocm_dirs[@]}"; do - for lib_subdir in lib lib64; do - if [ -d "$rocm_base/$lib_subdir/rocblas" ]; then - echo "Found rocblas data at $rocm_base/$lib_subdir/rocblas" - mkdir -p "$TARGET_LIB_DIR/rocblas" - cp -arfL "$rocm_base/$lib_subdir/rocblas/"* "$TARGET_LIB_DIR/rocblas/" || echo "WARNING: Failed to copy rocblas data from $rocm_base/$lib_subdir/rocblas" - rocblas_found=true - fi - done - done - if [ "$rocblas_found" = false ]; then - echo "WARNING: No rocblas library data found in /opt/rocm*/lib{,64}/rocblas" - fi + # Copy rocBLAS and hipBLASLt kernel data (TensileLibrary_*.dat tuning files) + # so the bundled libs find their per-arch kernels at runtime instead of + # falling back to slow generic code (see copy_rocm_data_dir / issue #10660). + copy_rocm_data_dir rocblas + copy_rocm_data_dir hipblaslt # Copy libomp from LLVM (required for ROCm) + # Single-line `local x=$(...)` on purpose: masks shopt -p's nonzero exit + # (nullglob unset) so it doesn't trip `set -e`. + local old_nullglob=$(shopt -p nullglob) shopt -s nullglob local omp_libs=(/opt/rocm*/lib/llvm/lib/libomp.so*) eval "$old_nullglob" @@ -477,6 +510,7 @@ export -f copy_libs_glob export -f is_core_lib export -f copy_elf_deps export -f sweep_transitive_deps +export -f copy_rocm_data_dir export -f package_cuda_libs export -f package_rocm_libs export -f package_intel_libs