feat(recon): enable cuDNN conv path on arm64+CUDA13 recon backends

The voice-detect.cpp / face-detect.cpp engines have an opt-in cuDNN implicit-GEMM conv path behind VOICEDETECT_GGML_CUDNN / FACEDETECT_GGML_CUDNN (default OFF) that eliminates im2col on the GPU and reaches torch-cuDNN parity (SCRFD 2.3x, WeSpeaker/ERes2Net parity), measured on the GB10 (arm64, CUDA 13, sm_121a).

Enable it for the CUDA build, but only where cuDNN actually ships: the arm64 + CUDA 13 image (GB10/Jetson/L4T). x86 CUDA images carry no cuDNN, so flipping it on globally for BUILD_TYPE=cublas would cause a link failure. The Makefiles gate on CUDA_MAJOR_VERSION=13 + arch (TARGETARCH from the matrix/Docker build, with a uname -m fallback for local builds).

backend/Dockerfile.golang already installs the runtime libcudnn9-cuda-13 in the arm64+CUDA13 apt block; add the matching libcudnn9-dev-cuda-13 so the build-time link resolves.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-06-24 16:49:06 -04:00 · 2026-06-24 15:54:12 +00:00
parent 9684c5dd7e
commit 7cbb743b25
3 changed files with 27 additions and 1 deletions
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -137,7 +137,7 @@ RUN <<EOT bash
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
+            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -24,6 +24,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false

+# Target arch, resolved once at parse time: a non-empty caller-supplied RECON_ARCH wins,
+# then TARGETARCH (amd64|arm64, matrix / Docker build), then uname -m (aarch64|x86_64).
+RECON_ARCH:=$(or $(RECON_ARCH),$(TARGETARCH),$(shell uname -m))
+
 # Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
 # so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
 # only system libs (libstdc++/libgomp/libc) the runtime image already provides.
@@ -41,6 +45,15 @@ endif
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
+    # Opt-in cuDNN implicit-GEMM conv path (no im2col on the GPU; measured on
+    # GB10: SCRFD 2.3x, torch-cuDNN parity). Gated to arm64 + CUDA 13 (GB10/
+    # Jetson/L4T), the only image shipping libcudnn9 + -dev; x86 CUDA images
+    # have no cuDNN -> link failure. Spaces, not tabs: directives reject tabs.
+    ifeq ($(CUDA_MAJOR_VERSION),13)
+    ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
+        CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
+    endif
+    endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -23,6 +23,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false

+# Target arch, resolved once at parse time: a non-empty caller-supplied RECON_ARCH wins,
+# then TARGETARCH (amd64|arm64, matrix / Docker build), then uname -m (aarch64|x86_64).
+RECON_ARCH:=$(or $(RECON_ARCH),$(TARGETARCH),$(shell uname -m))
+
 # Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
 # self-contained: dlopen needs no libggml*.so alongside it, only system libs
 # (libstdc++/libgomp/libc) that the runtime image already provides.
@@ -38,6 +42,15 @@ endif
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
+    # Opt-in cuDNN implicit-GEMM conv path (no im2col on the GPU; reaches
+    # torch-cuDNN parity). Gated to arm64 + CUDA 13 (GB10/Jetson/L4T), the
+    # only image shipping libcudnn9 + -dev; x86 CUDA images have no cuDNN
+    # -> link failure. Spaces, not tabs: conditional directives reject tabs.
+    ifeq ($(CUDA_MAJOR_VERSION),13)
+    ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
+        CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
+    endif
+    endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)