Compare commits

...

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
1c604d1b19 fix(gpu-libs): bundle hipBLASLt TensileLibrary data so ROCm backends stop falling back (#10660)
The ROCm packager copied rocBLAS kernel data (rocblas/library/*.dat) into the
bundled lib/ dir and run.sh pointed ROCBLAS_TENSILE_LIBPATH at it, but the
parallel hipBLASLt data dir (hipblaslt/library/TensileLibrary_lazy_gfx*.dat)
was never packaged and no HIPBLASLT_TENSILE_LIBPATH was set. The bundled
libhipblaslt.so therefore resolved its per-arch kernel data relative to itself,
found nothing, and silently fell back to slow generic kernels, logging:

    rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory
    rocblaslt error: Could not load "TensileLibrary_lazy_gfx1201.dat"

Fix, mirroring the existing rocBLAS handling:
- package-gpu-libs.sh: extract the rocblas data-dir copy into a reusable
  copy_rocm_data_dir helper and call it for both rocblas and hipblaslt.
- llama-cpp/turboquant run.sh: export HIPBLASLT_TENSILE_LIBPATH when the
  bundled hipblaslt/library dir exists.

The helper takes an optional ROCM_BASE_DIRS override so the copy is unit
testable without a real ROCm install; add a regression test that runs
package_rocm_libs against a fabricated ROCm tree and asserts both data dirs
are bundled.

Note: this bundles whatever gfx*.dat the build image's ROCm provides. If a
given arch's tensile data is absent from the shipped ROCm, that arch still
needs a ROCm bump; the packaging gap itself is fixed for every supported arch.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-07-03 20:30:24 +00:00
4 changed files with 122 additions and 19 deletions

View File

@@ -36,6 +36,12 @@ else
if [ -d "$CURDIR/lib/rocblas/library" ]; then
export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
fi
# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
# the bundled data or it falls back to slow generic kernels (issue #10660).
if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
fi
fi
# If there is a lib/ld.so, use it

View File

@@ -34,6 +34,12 @@ else
if [ -d "$CURDIR/lib/rocblas/library" ]; then
export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
fi
# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
# the bundled data or it falls back to slow generic kernels (issue #10660).
if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
fi
fi
# If there is a lib/ld.so, use it

View File

@@ -0,0 +1,57 @@
#!/bin/bash
# Regression test for scripts/build/package-gpu-libs.sh ROCm data bundling.
#
# Guards issue #10660: hipBLASLt (rocblaslt) resolves its TensileLibrary_lazy_gfx*.dat
# kernel data relative to the bundled libhipblaslt.so. The packager copied the
# rocblas/ data dir but not the hipblaslt/ data dir, so the bundled backend
# fell back to slow generic kernels and logged
#   rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory
#
# This test fabricates a fake ROCm tree containing both rocblas/ and hipblaslt/
# tensile data, points the packager at it via ROCM_BASE_DIRS, and asserts BOTH
# data directories are bundled into the target lib dir.
#
# The two fixture files share a file name but carry different contents, so the
# assertions compare contents as well as existence: a packager that copied the
# rocblas data into the hipblaslt directory (or vice versa) must fail here.
#
# Usage: run directly; takes no arguments. Exits 0 on PASS, 1 on FAIL.

set -euo pipefail

CURDIR=$(dirname "$(realpath "$0")")
SCRIPT="$CURDIR/package-gpu-libs.sh"
if [ ! -f "$SCRIPT" ]; then
    echo "FAIL: packager script not found at $SCRIPT"
    exit 1
fi

# Scratch area holding both the fake ROCm install and the packaging target;
# removed on every exit path.
WORK=$(mktemp -d)
trap 'rm -rf "$WORK"' EXIT

# Fabricate a fake ROCm install with both rocblas and hipblaslt tensile data.
FAKE_ROCM="$WORK/opt/rocm"
mkdir -p "$FAKE_ROCM/lib/rocblas/library"
mkdir -p "$FAKE_ROCM/lib/hipblaslt/library"
echo "fake rocblas tensile" > "$FAKE_ROCM/lib/rocblas/library/TensileLibrary_lazy_gfx1201.dat"
echo "fake hipblaslt tensile" > "$FAKE_ROCM/lib/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat"

TARGET="$WORK/target"
mkdir -p "$TARGET"

# Load the packager's functions into this shell, with TARGET as its argument.
# shellcheck source=/dev/null
source "$SCRIPT" "$TARGET"

# Point the data-dir copy at the fabricated tree instead of the real /opt/rocm,
# then run the actual ROCm packager. This asserts package_rocm_libs itself
# bundles BOTH data dirs, not just that the helper works in isolation.
export BUILD_TYPE=hipblas
export ROCM_BASE_DIRS="$FAKE_ROCM"
package_rocm_libs

fail=false

# rocblas: the file must exist AND be the rocblas fixture.
rocblas_src="$FAKE_ROCM/lib/rocblas/library/TensileLibrary_lazy_gfx1201.dat"
rocblas_dst="$TARGET/rocblas/library/TensileLibrary_lazy_gfx1201.dat"
if [ ! -e "$rocblas_dst" ]; then
    echo "FAIL: rocblas tensile data was NOT bundled"
    fail=true
elif ! cmp -s "$rocblas_src" "$rocblas_dst"; then
    echo "FAIL: bundled rocblas tensile data does not match the rocblas source"
    fail=true
fi

# hipblaslt: the file must exist AND be the hipblaslt fixture.
hipblaslt_src="$FAKE_ROCM/lib/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat"
hipblaslt_dst="$TARGET/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat"
if [ ! -e "$hipblaslt_dst" ]; then
    echo "FAIL: hipblaslt tensile data was NOT bundled (regression of #10660)"
    fail=true
elif ! cmp -s "$hipblaslt_src" "$hipblaslt_dst"; then
    echo "FAIL: bundled hipblaslt tensile data does not match the hipblaslt source"
    fail=true
fi

if [ "$fail" = true ]; then
    # Dump what was actually bundled to make the failure diagnosable.
    ls -R "$TARGET" || true
    exit 1
fi

echo "PASS: rocblas and hipblaslt tensile data were both bundled"
exit 0

View File

@@ -224,6 +224,50 @@ package_cuda_libs() {
echo "CUDA libraries packaged successfully"
}
# Copy a ROCm library data subdirectory (e.g. rocblas, hipblaslt) into the
# bundled lib/ dir. These directories hold the TensileLibrary_*.dat GPU kernel
# tuning files, which rocBLAS/hipBLASLt load at runtime *relative to their own
# .so*. Since backends ship their own copies of libhipblaslt.so/librocblas.so
# under lib/, the matching data dir must travel with them or the libs fall back
# to slow generic kernels (rocblaslt error: Cannot read TensileLibrary_lazy_gfx*.dat;
# see issue #10660).
#
# Every <rocm-root>/lib{,64}/<name> directory that exists is merged into
# $TARGET_LIB_DIR/<name>/. Search locations that resolve to the same physical
# directory (for example a symlinked alias root, or lib64 pointing at lib) are
# copied only once.
#
# Globals:
#   TARGET_LIB_DIR (read) - bundle lib dir that receives <name>/
#   ROCM_BASE_DIRS (read) - optional space-separated list of ROCm search roots;
#                           defaults to /opt/rocm{,-*}. Overriding it keeps the
#                           copy unit-testable without a real ROCm install.
# Args:    $1 = data subdir name found under <rocm-root>/lib{,64}/
# Outputs: progress and WARNING lines on stdout.
# A missing data dir or a failed copy is reported as a WARNING, not an error.
copy_rocm_data_dir() {
    local data_name="$1"

    # Expand the search roots with nullglob on so an unmatched /opt/rocm-*
    # vanishes instead of staying as a literal pattern. Only switch it off
    # again if it was off on entry, so the caller's setting is preserved.
    local restore_nullglob=false
    if ! shopt -q nullglob; then
        shopt -s nullglob
        restore_nullglob=true
    fi
    local rocm_dirs
    if [ -n "${ROCM_BASE_DIRS:-}" ]; then
        # shellcheck disable=SC2206 # intentional word-split of the override
        rocm_dirs=(${ROCM_BASE_DIRS})
    else
        rocm_dirs=(/opt/rocm /opt/rocm-*)
    fi
    if [ "$restore_nullglob" = true ]; then
        shopt -u nullglob
    fi

    local found=false
    # Newline-delimited list of physical source dirs already copied.
    local seen=$'\n'
    local rocm_base lib_subdir src resolved
    for rocm_base in "${rocm_dirs[@]}"; do
        for lib_subdir in lib lib64; do
            src="$rocm_base/$lib_subdir/$data_name"
            [ -d "$src" ] || continue
            found=true

            # Resolve symlinks so the same tree reached through two names is
            # not copied twice; fall back to the literal path if cd fails.
            resolved=$(cd -- "$src" >/dev/null 2>&1 && pwd -P) || resolved="$src"
            case "$seen" in
                *$'\n'"$resolved"$'\n'*) continue ;;
            esac
            seen+="$resolved"$'\n'

            echo "Found $data_name data at $src"
            mkdir -p "$TARGET_LIB_DIR/$data_name"
            # Copy "<src>/." rather than "<src>/"*: it does not depend on glob
            # settings, includes dotfiles, and succeeds on an empty directory.
            cp -arfL "$src/." "$TARGET_LIB_DIR/$data_name/" || echo "WARNING: Failed to copy $data_name data from $src"
        done
    done

    if [ "$found" = false ]; then
        echo "WARNING: No $data_name library data found in ${ROCM_BASE_DIRS:-/opt/rocm*}/lib{,64}/$data_name"
    fi
}
# Package AMD ROCm/HIPBlas libraries
package_rocm_libs() {
echo "Packaging ROCm/HIPBlas libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -267,27 +311,16 @@ package_rocm_libs() {
fi
done
# Copy rocblas library data (tuning files, TensileLibrary, etc.)
local old_nullglob=$(shopt -p nullglob)
shopt -s nullglob
local rocm_dirs=(/opt/rocm /opt/rocm-*)
eval "$old_nullglob"
local rocblas_found=false
for rocm_base in "${rocm_dirs[@]}"; do
for lib_subdir in lib lib64; do
if [ -d "$rocm_base/$lib_subdir/rocblas" ]; then
echo "Found rocblas data at $rocm_base/$lib_subdir/rocblas"
mkdir -p "$TARGET_LIB_DIR/rocblas"
cp -arfL "$rocm_base/$lib_subdir/rocblas/"* "$TARGET_LIB_DIR/rocblas/" || echo "WARNING: Failed to copy rocblas data from $rocm_base/$lib_subdir/rocblas"
rocblas_found=true
fi
done
done
if [ "$rocblas_found" = false ]; then
echo "WARNING: No rocblas library data found in /opt/rocm*/lib{,64}/rocblas"
fi
# Copy rocBLAS and hipBLASLt kernel data (TensileLibrary_*.dat tuning files)
# so the bundled libs find their per-arch kernels at runtime instead of
# falling back to slow generic code (see copy_rocm_data_dir / issue #10660).
copy_rocm_data_dir rocblas
copy_rocm_data_dir hipblaslt
# Copy libomp from LLVM (required for ROCm)
# Single-line `local x=$(...)` on purpose: masks shopt -p's nonzero exit
# (nullglob unset) so it doesn't trip `set -e`.
local old_nullglob=$(shopt -p nullglob)
shopt -s nullglob
local omp_libs=(/opt/rocm*/lib/llvm/lib/libomp.so*)
eval "$old_nullglob"
@@ -477,6 +510,7 @@ export -f copy_libs_glob
export -f is_core_lib
export -f copy_elf_deps
export -f sweep_transitive_deps
export -f copy_rocm_data_dir
export -f package_cuda_libs
export -f package_rocm_libs
export -f package_intel_libs