feat(recon): enable cuDNN conv path on arm64+CUDA13 recon backends

The voice-detect.cpp / face-detect.cpp engines have an opt-in cuDNN
implicit-GEMM conv path behind VOICEDETECT_GGML_CUDNN / FACEDETECT_GGML_CUDNN
(default OFF) that kills im2col on the GPU and reaches torch-cuDNN parity
(SCRFD 2.3x, WeSpeaker/ERes2Net parity), measured on the GB10
(arm64, CUDA 13, sm_121a).

Enable it for the CUDA build, but only where cuDNN actually ships: the
arm64 + CUDA 13 image (GB10/Jetson/L4T). x86 CUDA images carry no cuDNN,
so flipping it on globally for BUILD_TYPE=cublas would be a link failure.
The Makefiles gate on CUDA_MAJOR_VERSION=13 + arch (TARGETARCH from the
matrix/Docker build, uname -m fallback for local builds).

backend/Dockerfile.golang already installs the runtime libcudnn9-cuda-13
in the arm64+CUDA13 apt block; add the matching libcudnn9-dev-cuda-13 so
the build-time link resolves.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
This commit is contained in:
Ettore Di Giacinto
2026-06-24 15:54:12 +00:00
parent 9684c5dd7e
commit 7cbb743b25
3 changed files with 27 additions and 1 deletions

View File

@@ -137,7 +137,7 @@ RUN <<EOT bash
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi
apt-get clean && \
rm -rf /var/lib/apt/lists/*

View File

@@ -24,6 +24,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
BUILD_TYPE?=
NATIVE?=false
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
@@ -41,6 +45,15 @@ endif
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x
# vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
ifeq ($(CUDA_MAJOR_VERSION),13)
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
endif
endif
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)

View File

@@ -23,6 +23,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
BUILD_TYPE?=
NATIVE?=false
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
# self-contained: dlopen needs no libggml*.so alongside it, only system libs
# (libstdc++/libgomp/libc) that the runtime image already provides.
@@ -38,6 +42,15 @@ endif
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches
# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
ifeq ($(CUDA_MAJOR_VERSION),13)
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
endif
endif
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)