fix(backends): enable ROCm/HIP GPU offload for ggml audio backends (#10666) (#10667)

qwen3-tts-cpp, omnivoice-cpp, acestep-cpp and vibevoice-cpp shipped rocm-* variants that silently ran on CPU ([Load] backend: CPU). Two coupled defects: - The Makefiles passed -DGGML_HIPBLAS=ON, but the vendored ggml only understands -DGGML_HIP=ON (GGML_HIPBLAS was removed upstream), so the ggml-hip backend target was never created and no GPU code was built. - The CMake foreach that links the ggml GPU backends into the module listed blas/cuda/metal/vulkan but not hip, so even a built ggml-hip would not have been linked and its static backend registration would never run. CUDA users were unaffected because cublas passes the correct GGML_CUDA=ON and the foreach already links cuda. Mirror the proven llama-cpp hipblas block (ROCm clang CC/CXX + AMDGPU_TARGETS) and add hip to each foreach. Upstream picks the best device via ggml_backend_init_best(), so no runtime flag is needed once HIP is compiled and linked. Assisted-by: Claude:claude-opus-4-8[1m] [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
chore: ⬆️ Update CrispStrobe/CrispASR to f35185b876fc482fcb2053a81a2697936ed5fcc0 (#10670)
2026-07-04 05:16:42 -04:00 · 2026-07-04 09:08:20 +02:00 · 2026-07-04 08:17:02 +02:00 · 2026-07-04 08:16:41 +02:00 · 2026-07-04 08:14:12 +02:00
33 changed files with 184 additions and 53 deletions
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=fdb1db877c526ec90f668eca1b858da5dba85560
+LLAMA_VERSION?=d4cff114c0084f1fbc9b4c62717eca8fb2ae494a
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -36,6 +36,12 @@ else
 	if [ -d "$CURDIR/lib/rocblas/library" ]; then
 		export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
 	fi
+	# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
+	# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
+	# the bundled data or it falls back to slow generic kernels (issue #10660).
+	if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
+		export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
+	fi
 fi

 # If there is a lib/ld.so, use it
--- a/backend/cpp/turboquant/run.sh
+++ b/backend/cpp/turboquant/run.sh
@@ -34,6 +34,12 @@ else
 	if [ -d "$CURDIR/lib/rocblas/library" ]; then
 		export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
 	fi
+	# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
+	# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
+	# the bundled data or it falls back to slow generic kernels (issue #10660).
+	if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
+		export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
+	fi
 fi

 # If there is a lib/ld.so, use it
--- a/backend/go/acestep-cpp/CMakeLists.txt
+++ b/backend/go/acestep-cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ target_include_directories(goacestepcpp PRIVATE ${ACESTEP_DIR}/src ${ACESTEP_DIR
 target_include_directories(goacestepcpp SYSTEM PRIVATE ${ACESTEP_DIR}/ggml/include)

 # Link GPU backends if available (mirrors link_ggml_backends macro)
-foreach(backend blas cuda metal vulkan)
+foreach(backend blas cuda hip metal vulkan)
    if(TARGET ggml-${backend})
        target_link_libraries(goacestepcpp PRIVATE ggml-${backend})
        string(TOUPPER ${backend} BACKEND_UPPER)
--- a/backend/go/acestep-cpp/Makefile
+++ b/backend/go/acestep-cpp/Makefile
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=9a26976a8c8cf5af0afcdd04463cf8ba91e96a54
+CRISPASR_VERSION?=f35185b876fc482fcb2053a81a2697936ed5fcc0
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/omnivoice-cpp/CMakeLists.txt
+++ b/backend/go/omnivoice-cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src)
 target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include)

 # Link GPU backends if the upstream ggml created them.
-foreach(backend blas cuda metal vulkan sycl)
+foreach(backend blas cuda hip metal vulkan sycl)
    if(TARGET ggml-${backend})
        target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend})
        if(backend STREQUAL "cuda")
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/qwen3-tts-cpp/CMakeLists.txt
+++ b/backend/go/qwen3-tts-cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src)
 target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include)

 # Link GPU backends if the upstream ggml created them.
-foreach(backend blas cuda metal vulkan sycl)
+foreach(backend blas cuda hip metal vulkan sycl)
    if(TARGET ggml-${backend})
        target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend})
        if(backend STREQUAL "cuda")
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/vibevoice-cpp/CMakeLists.txt
+++ b/backend/go/vibevoice-cpp/CMakeLists.txt
@@ -50,7 +50,7 @@ target_include_directories(govibevoicecpp SYSTEM PRIVATE ${VIBEVOICE_DIR}/third_
 # Link GPU backends if available — vibevoice's own CMake already links
 # these to the libvibevoice STATIC library, but we re-link them on the
 # MODULE so resolved symbols include all backend kernels.
-foreach(backend blas cuda metal vulkan)
+foreach(backend blas cuda hip metal vulkan)
    if(TARGET ggml-${backend})
        target_link_libraries(govibevoicecpp PRIVATE ggml-${backend})
        string(TOUPPER ${backend} BACKEND_UPPER)
--- a/backend/go/vibevoice-cpp/Makefile
+++ b/backend/go/vibevoice-cpp/Makefile
@@ -29,7 +29,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DVIBEVOICE_GGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DVIBEVOICE_GGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/python/ace-step/requirements-cpu.txt
+++ b/backend/python/ace-step/requirements-cpu.txt
@@ -4,7 +4,7 @@ torchaudio
 torchvision

 # Core dependencies
-transformers>=5.13.0,<5.14.0
+transformers>=4.51.0,<4.58.0
 diffusers
 gradio
 matplotlib>=3.7.5
--- a/backend/python/ace-step/requirements-cublas12.txt
+++ b/backend/python/ace-step/requirements-cublas12.txt
@@ -4,7 +4,7 @@ torchaudio
 torchvision

 # Core dependencies
-transformers>=5.13.0,<5.14.0
+transformers>=4.51.0,<4.58.0
 diffusers
 gradio>=6.5.1
 matplotlib>=3.7.5
--- a/backend/python/ace-step/requirements-cublas13.txt
+++ b/backend/python/ace-step/requirements-cublas13.txt
@@ -4,7 +4,7 @@ torchaudio
 torchvision

 # Core dependencies
-transformers>=5.13.0,<5.14.0
+transformers>=4.51.0,<4.58.0
 diffusers
 gradio>=6.5.1
 matplotlib>=3.7.5
--- a/backend/python/ace-step/requirements-hipblas.txt
+++ b/backend/python/ace-step/requirements-hipblas.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
-torch==2.12.0+cpu
+torch==2.10.0+rocm7.0
 torchaudio
 torchvision

 # Core dependencies
-transformers>=5.13.0,<5.14.0
+transformers>=4.51.0,<4.58.0
 diffusers
 gradio>=6.5.1
 matplotlib>=3.7.5
--- a/backend/python/ace-step/requirements-intel.txt
+++ b/backend/python/ace-step/requirements-intel.txt
@@ -4,7 +4,7 @@ torchaudio
 torchvision

 # Core dependencies
-transformers>=5.13.0,<5.14.0
+transformers>=4.51.0,<4.58.0
 diffusers
 gradio
 matplotlib>=3.7.5
--- a/backend/python/ace-step/requirements-l4t13.txt
+++ b/backend/python/ace-step/requirements-l4t13.txt
@@ -3,7 +3,7 @@ torch
 torchaudio
 torchvision
 # Core dependencies
-transformers>=5.13.0,<5.14.0
+transformers>=4.51.0,<4.58.0
 diffusers
 gradio>=6.5.1
 matplotlib>=3.7.5
--- a/backend/python/ace-step/requirements-mps.txt
+++ b/backend/python/ace-step/requirements-mps.txt
@@ -3,7 +3,7 @@ torchaudio
 torchvision

 # Core dependencies
-transformers>=5.13.0,<5.14.0
+transformers>=4.51.0,<4.58.0
 diffusers
 gradio
 matplotlib>=3.7.5
--- a/backend/python/rfdetr/requirements-cpu.txt
+++ b/backend/python/rfdetr/requirements-cpu.txt
@@ -3,5 +3,5 @@ opencv-python
 accelerate
 peft
 inference
-torch==2.12.0+cu130
+torch==2.7.1
 optimum-quanto
--- a/backend/python/rfdetr/requirements-cublas12.txt
+++ b/backend/python/rfdetr/requirements-cublas12.txt
@@ -1,4 +1,4 @@
-torch==2.12.0+cu130
+torch==2.7.1
 rfdetr
 opencv-python
 accelerate
--- a/backend/python/rfdetr/requirements-cublas13.txt
+++ b/backend/python/rfdetr/requirements-cublas13.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-torch==2.12.0+cu130
+torch==2.9.1
 rfdetr
 opencv-python
 accelerate
--- a/backend/python/rfdetr/requirements-hipblas.txt
+++ b/backend/python/rfdetr/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
-torch==2.12.0+cu130
+torch==2.10.0+rocm7.0
 torchvision==0.25.0+rocm7.0
 rfdetr
 opencv-python
--- a/backend/python/rfdetr/requirements-mps.txt
+++ b/backend/python/rfdetr/requirements-mps.txt
@@ -1,4 +1,4 @@
-torch==2.12.0+cu130
+torch==2.7.1
 rfdetr
 opencv-python
 accelerate
--- a/backend/python/sglang/requirements-cpu.txt
+++ b/backend/python/sglang/requirements-cpu.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.12.0+cpu
+torch==2.9.0
 torchvision
 torchaudio
 transformers
--- a/backend/python/sglang/requirements-cublas12.txt
+++ b/backend/python/sglang/requirements-cublas12.txt
@@ -6,7 +6,7 @@
 # for cublas12 so uv consults this index alongside PyPI.
 --extra-index-url https://download.pytorch.org/whl/cu128
 accelerate
-torch==2.12.0+cpu
+torch==2.9.1
 torchvision
 torchaudio
 transformers
--- a/backend/python/trl/requirements-cpu.txt
+++ b/backend/python/trl/requirements-cpu.txt
@@ -1,9 +1,9 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.12.0+cpu
+torch==2.10.0
 trl
 peft
 datasets>=3.0.0
-transformers>=5.13.0
+transformers>=4.56.2
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/trl/requirements-cublas12.txt
+++ b/backend/python/trl/requirements-cublas12.txt
@@ -1,8 +1,8 @@
-torch==2.12.0+cpu
+torch==2.10.0
 trl
 peft
 datasets>=3.0.0
-transformers>=5.13.0
+transformers>=4.56.2
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/trl/requirements-cublas13.txt
+++ b/backend/python/trl/requirements-cublas13.txt
@@ -1,8 +1,8 @@
-torch==2.12.0+cpu
+torch==2.10.0
 trl
 peft
 datasets>=3.0.0
-transformers>=5.13.0
+transformers>=4.56.2
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/trl/requirements-mps.txt
+++ b/backend/python/trl/requirements-mps.txt
@@ -1,8 +1,8 @@
-torch==2.12.0+cpu
+torch==2.10.0
 trl
 peft
 datasets>=3.0.0
-transformers>=5.13.0
+transformers>=4.56.2
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/vllm-omni/requirements-cublas12.txt
+++ b/backend/python/vllm-omni/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 accelerate
-torch==2.12.0+cu130
+torch==2.7.0
 transformers
 bitsandbytes
--- a/scripts/build/package-gpu-libs-rocm-data_test.sh
+++ b/scripts/build/package-gpu-libs-rocm-data_test.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Regression test for scripts/build/package-gpu-libs.sh ROCm data bundling.
+#
+# Guards issue #10660: hipBLASLt (rocblaslt) resolves its TensileLibrary_lazy_gfx*.dat
+# kernel data relative to the bundled libhipblaslt.so. The packager copied the
+# rocblas/ data dir but not the hipblaslt/ data dir, so the bundled backend
+# fell back to slow generic kernels and logged
+#   rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory
+#
+# This test fabricates a fake ROCm tree containing both rocblas/ and hipblaslt/
+# tensile data, points the packager at it via ROCM_BASE_DIRS, and asserts BOTH
+# data directories are bundled into the target lib dir.
+set -euo pipefail
+
+CURDIR=$(dirname "$(realpath "$0")")
+SCRIPT="$CURDIR/package-gpu-libs.sh"
+
+WORK=$(mktemp -d)
+trap 'rm -rf "$WORK"' EXIT
+
+# Fabricate a fake ROCm install with both rocblas and hipblaslt tensile data.
+FAKE_ROCM="$WORK/opt/rocm"
+mkdir -p "$FAKE_ROCM/lib/rocblas/library"
+mkdir -p "$FAKE_ROCM/lib/hipblaslt/library"
+echo "fake rocblas tensile" > "$FAKE_ROCM/lib/rocblas/library/TensileLibrary_lazy_gfx1201.dat"
+echo "fake hipblaslt tensile" > "$FAKE_ROCM/lib/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat"
+
+TARGET="$WORK/target"
+mkdir -p "$TARGET"
+
+# shellcheck source=/dev/null
+source "$SCRIPT" "$TARGET"
+
+# Point the data-dir copy at the fabricated tree instead of the real /opt/rocm,
+# then run the actual ROCm packager. This asserts package_rocm_libs itself
+# bundles BOTH data dirs, not just that the helper works in isolation.
+export BUILD_TYPE=hipblas
+export ROCM_BASE_DIRS="$FAKE_ROCM"
+package_rocm_libs
+
+fail=false
+if [ ! -e "$TARGET/rocblas/library/TensileLibrary_lazy_gfx1201.dat" ]; then
+    echo "FAIL: rocblas tensile data was NOT bundled"
+    fail=true
+fi
+if [ ! -e "$TARGET/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat" ]; then
+    echo "FAIL: hipblaslt tensile data was NOT bundled (regression of #10660)"
+    fail=true
+fi
+
+if [ "$fail" = true ]; then
+    ls -R "$TARGET" || true
+    exit 1
+fi
+
+echo "PASS: rocblas and hipblaslt tensile data were both bundled"
+exit 0
--- a/scripts/build/package-gpu-libs.sh
+++ b/scripts/build/package-gpu-libs.sh
@@ -224,6 +224,50 @@ package_cuda_libs() {
    echo "CUDA libraries packaged successfully"
 }

+# Copy a ROCm library data subdirectory (e.g. rocblas, hipblaslt) into the
+# bundled lib/ dir. These directories hold the TensileLibrary_*.dat GPU kernel
+# tuning files, which rocBLAS/hipBLASLt load at runtime *relative to their own
+# .so*. Since backends ship their own copies of libhipblaslt.so/librocblas.so
+# under lib/, the matching data dir must travel with them or the libs fall back
+# to slow generic kernels (rocblaslt error: Cannot read TensileLibrary_lazy_gfx*.dat;
+# see issue #10660).
+#
+# The ROCm search roots default to /opt/rocm{,-*} but can be overridden via the
+# ROCM_BASE_DIRS env var (space-separated), which keeps the copy unit-testable
+# without a real ROCm install.
+# Args: $1 = data subdir name found under <rocm-root>/lib{,64}/
+copy_rocm_data_dir() {
+    local data_name="$1"
+    # Single-line `local x=$(...)` on purpose: `local` masks the command
+    # substitution's exit status, which is 1 when nullglob is unset and would
+    # otherwise trip the script's `set -e`.
+    local old_nullglob=$(shopt -p nullglob)
+    shopt -s nullglob
+    local rocm_dirs
+    if [ -n "${ROCM_BASE_DIRS:-}" ]; then
+        # shellcheck disable=SC2206  # intentional word-split of the override
+        rocm_dirs=(${ROCM_BASE_DIRS})
+    else
+        rocm_dirs=(/opt/rocm /opt/rocm-*)
+    fi
+    eval "$old_nullglob"
+    local found=false
+    local rocm_base lib_subdir
+    for rocm_base in "${rocm_dirs[@]}"; do
+        for lib_subdir in lib lib64; do
+            if [ -d "$rocm_base/$lib_subdir/$data_name" ]; then
+                echo "Found $data_name data at $rocm_base/$lib_subdir/$data_name"
+                mkdir -p "$TARGET_LIB_DIR/$data_name"
+                cp -arfL "$rocm_base/$lib_subdir/$data_name/"* "$TARGET_LIB_DIR/$data_name/" || echo "WARNING: Failed to copy $data_name data from $rocm_base/$lib_subdir/$data_name"
+                found=true
+            fi
+        done
+    done
+    if [ "$found" = false ]; then
+        echo "WARNING: No $data_name library data found in ${ROCM_BASE_DIRS:-/opt/rocm*}/lib{,64}/$data_name"
+    fi
+}
+
 # Package AMD ROCm/HIPBlas libraries
 package_rocm_libs() {
    echo "Packaging ROCm/HIPBlas libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -267,27 +311,16 @@ package_rocm_libs() {
        fi
    done

-    # Copy rocblas library data (tuning files, TensileLibrary, etc.)
-    local old_nullglob=$(shopt -p nullglob)
-    shopt -s nullglob
-    local rocm_dirs=(/opt/rocm /opt/rocm-*)
-    eval "$old_nullglob"
-    local rocblas_found=false
-    for rocm_base in "${rocm_dirs[@]}"; do
-        for lib_subdir in lib lib64; do
-            if [ -d "$rocm_base/$lib_subdir/rocblas" ]; then
-                echo "Found rocblas data at $rocm_base/$lib_subdir/rocblas"
-                mkdir -p "$TARGET_LIB_DIR/rocblas"
-                cp -arfL "$rocm_base/$lib_subdir/rocblas/"* "$TARGET_LIB_DIR/rocblas/" || echo "WARNING: Failed to copy rocblas data from $rocm_base/$lib_subdir/rocblas"
-                rocblas_found=true
-            fi
-        done
-    done
-    if [ "$rocblas_found" = false ]; then
-        echo "WARNING: No rocblas library data found in /opt/rocm*/lib{,64}/rocblas"
-    fi
+    # Copy rocBLAS and hipBLASLt kernel data (TensileLibrary_*.dat tuning files)
+    # so the bundled libs find their per-arch kernels at runtime instead of
+    # falling back to slow generic code (see copy_rocm_data_dir / issue #10660).
+    copy_rocm_data_dir rocblas
+    copy_rocm_data_dir hipblaslt

    # Copy libomp from LLVM (required for ROCm)
+    # Single-line `local x=$(...)` on purpose: masks shopt -p's nonzero exit
+    # (nullglob unset) so it doesn't trip `set -e`.
+    local old_nullglob=$(shopt -p nullglob)
    shopt -s nullglob
    local omp_libs=(/opt/rocm*/lib/llvm/lib/libomp.so*)
    eval "$old_nullglob"
@@ -477,6 +510,7 @@ export -f copy_libs_glob
 export -f is_core_lib
 export -f copy_elf_deps
 export -f sweep_transitive_deps
+export -f copy_rocm_data_dir
 export -f package_cuda_libs
 export -f package_rocm_libs
 export -f package_intel_libs
Author	SHA1	Message	Date
LocalAI [bot]	38350d363e	fix(backends): enable ROCm/HIP GPU offload for ggml audio backends (#10666 ) (#10667 ) qwen3-tts-cpp, omnivoice-cpp, acestep-cpp and vibevoice-cpp shipped rocm-* variants that silently ran on CPU ([Load] backend: CPU). Two coupled defects: - The Makefiles passed -DGGML_HIPBLAS=ON, but the vendored ggml only understands -DGGML_HIP=ON (GGML_HIPBLAS was removed upstream), so the ggml-hip backend target was never created and no GPU code was built. - The CMake foreach that links the ggml GPU backends into the module listed blas/cuda/metal/vulkan but not hip, so even a built ggml-hip would not have been linked and its static backend registration would never run. CUDA users were unaffected because cublas passes the correct GGML_CUDA=ON and the foreach already links cuda. Mirror the proven llama-cpp hipblas block (ROCm clang CC/CXX + AMDGPU_TARGETS) and add hip to each foreach. Upstream picks the best device via ggml_backend_init_best(), so no runtime flag is needed once HIP is compiled and linked. Assisted-by: Claude:claude-opus-4-8[1m] [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-07-04 09:08:20 +02:00
LocalAI [bot]	817136c20e	chore: ⬆️ Update CrispStrobe/CrispASR to `f35185b876fc482fcb2053a81a2697936ed5fcc0` (#10670) ⬆️ Update CrispStrobe/CrispASR Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-07-04 08:17:02 +02:00
LocalAI [bot]	8396ce1388	chore: ⬆️ Update ggml-org/llama.cpp to `d4cff114c0084f1fbc9b4c62717eca8fb2ae494a` (#10671) ⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-07-04 08:16:41 +02:00
LocalAI [bot]	348f3c87c0	fix(gpu-libs): bundle hipBLASLt TensileLibrary data so ROCm backends stop falling back (#10660) (#10672) The ROCm packager copied rocBLAS kernel data (rocblas/library/*.dat) into the bundled lib/ dir and run.sh pointed ROCBLAS_TENSILE_LIBPATH at it, but the parallel hipBLASLt data dir (hipblaslt/library/TensileLibrary_lazy_gfx*.dat) was never packaged and no HIPBLASLT_TENSILE_LIBPATH was set. The bundled libhipblaslt.so therefore resolved its per-arch kernel data relative to itself, found nothing, and silently fell back to slow generic kernels, logging: rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory rocblaslt error: Could not load "TensileLibrary_lazy_gfx1201.dat" Fix, mirroring the existing rocBLAS handling: - package-gpu-libs.sh: extract the rocblas data-dir copy into a reusable copy_rocm_data_dir helper and call it for both rocblas and hipblaslt. - llama-cpp/turboquant run.sh: export HIPBLASLT_TENSILE_LIBPATH when the bundled hipblaslt/library dir exists. The helper takes an optional ROCM_BASE_DIRS override so the copy is unit testable without a real ROCm install; add a regression test that runs package_rocm_libs against a fabricated ROCm tree and asserts both data dirs are bundled. Note: this bundles whatever gfx*.dat the build image's ROCm provides. If a given arch's tensile data is absent from the shipped ROCm, that arch still needs a ROCm bump; the packaging gap itself is fixed for every supported arch. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-07-04 08:14:12 +02:00