From 7cbb743b2549200bee5ef6f228febcc96e6cb28a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 24 Jun 2026 15:54:12 +0000 Subject: [PATCH] feat(recon): enable cuDNN conv path on arm64+CUDA13 recon backends The voice-detect.cpp / face-detect.cpp engines have an opt-in cuDNN implicit-GEMM conv path behind VOICEDETECT_GGML_CUDNN / FACEDETECT_GGML_CUDNN (default OFF) that kills im2col on the GPU and reaches torch-cuDNN parity (SCRFD 2.3x, WeSpeaker/ERes2Net parity), measured on the GB10 (arm64, CUDA 13, sm_121a). Enable it for the CUDA build, but only where cuDNN actually ships: the arm64 + CUDA 13 image (GB10/Jetson/L4T). x86 CUDA images carry no cuDNN, so flipping it on globally for BUILD_TYPE=cublas would be a link failure. The Makefiles gate on CUDA_MAJOR_VERSION=13 + arch (TARGETARCH from the matrix/Docker build, uname -m fallback for local builds). backend/Dockerfile.golang already installs the runtime libcudnn9-cuda-13 in the arm64+CUDA13 apt block; add the matching libcudnn9-dev-cuda-13 so the build-time link resolves. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --- backend/Dockerfile.golang | 2 +- backend/go/face-detect/Makefile | 13 +++++++++++++ backend/go/voice-detect/Makefile | 13 +++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/backend/Dockerfile.golang b/backend/Dockerfile.golang index d188cdf70..13032fa22 100644 --- a/backend/Dockerfile.golang +++ b/backend/Dockerfile.golang @@ -137,7 +137,7 @@ RUN </dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4) BUILD_TYPE?= NATIVE?=false +# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH +# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build. +RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m)) + # Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC) # so the shared lib is self-contained: dlopen needs no libggml*.so alongside it, # only system libs (libstdc++/libgomp/libc) the runtime image already provides. @@ -41,6 +45,15 @@ endif # options instead. (openblas is not gated, so -DGGML_BLAS passes through.) ifeq ($(BUILD_TYPE),cublas) CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON + # Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x + # vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T) + # ships libcudnn9 + the -dev headers, so gate cuDNN to that variant. + # x86 CUDA images carry no cuDNN -> enabling it there is a link failure. + ifeq ($(CUDA_MAJOR_VERSION),13) + ifneq (,$(filter arm64 aarch64,$(RECON_ARCH))) + CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON + endif + endif else ifeq ($(BUILD_TYPE),openblas) CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS else ifeq ($(BUILD_TYPE),hipblas) diff --git a/backend/go/voice-detect/Makefile b/backend/go/voice-detect/Makefile index dcd490b10..3d1079f8f 100644 --- a/backend/go/voice-detect/Makefile +++ b/backend/go/voice-detect/Makefile @@ -23,6 +23,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4) BUILD_TYPE?= NATIVE?=false +# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH +# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build. +RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m)) + # Build ggml statically into libvoicedetect.so (PIC) so the shared lib is # self-contained: dlopen needs no libggml*.so alongside it, only system libs # (libstdc++/libgomp/libc) that the runtime image already provides. @@ -38,6 +42,15 @@ endif # options instead. (openblas is not gated, so -DGGML_BLAS passes through.) ifeq ($(BUILD_TYPE),cublas) CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON + # Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches + # torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T) + # ships libcudnn9 + the -dev headers, so gate cuDNN to that variant. + # x86 CUDA images carry no cuDNN -> enabling it there is a link failure. + ifeq ($(CUDA_MAJOR_VERSION),13) + ifneq (,$(filter arm64 aarch64,$(RECON_ARCH))) + CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON + endif + endif else ifeq ($(BUILD_TYPE),openblas) CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS else ifeq ($(BUILD_TYPE),hipblas)