diff --git a/backend/Dockerfile.golang b/backend/Dockerfile.golang
index d188cdf70..13032fa22 100644
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -137,7 +137,7 @@ RUN <<EOT bash
             libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
         if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
             apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
+            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
         fi
         apt-get clean && \
         rm -rf /var/lib/apt/lists/*
diff --git a/backend/go/face-detect/Makefile b/backend/go/face-detect/Makefile
index 14d723054..db05eb8c3 100644
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -24,6 +24,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 
+# Resolve the target arch once, at parse time. Order: a preset RECON_ARCH, then TARGETARCH
+# (amd64|arm64, from the backend matrix / Docker build), then uname -m (aarch64|x86_64) locally.
+RECON_ARCH:=$(or $(RECON_ARCH),$(TARGETARCH),$(shell uname -m))
+
 # Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
 # so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
 # only system libs (libstdc++/libgomp/libc) the runtime image already provides.
@@ -41,6 +45,15 @@ endif
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
+	# Turn on the cuDNN implicit-GEMM conv path (no im2col on GPU; reported as SCRFD
+	# 2.3x, at torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
+	# ships libcudnn9 + the -dev headers; x86 CUDA images carry no cuDNN, so enabling
+	# it there is a link failure. $(strip) tolerates stray blanks in CUDA_MAJOR_VERSION.
+	ifeq ($(strip $(CUDA_MAJOR_VERSION)),13)
+	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
+		CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
+	endif
+	endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
diff --git a/backend/go/voice-detect/Makefile b/backend/go/voice-detect/Makefile
index dcd490b10..3d1079f8f 100644
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -23,6 +23,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 
+# Resolve the target arch once, at parse time. Order: a preset RECON_ARCH, then TARGETARCH
+# (amd64|arm64, from the backend matrix / Docker build), then uname -m (aarch64|x86_64) locally.
+RECON_ARCH:=$(or $(RECON_ARCH),$(TARGETARCH),$(shell uname -m))
+
 # Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
 # self-contained: dlopen needs no libggml*.so alongside it, only system libs
 # (libstdc++/libgomp/libc) that the runtime image already provides.
@@ -38,6 +42,15 @@ endif
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
+	# Turn on the cuDNN implicit-GEMM conv path (no im2col on GPU; reported to reach
+	# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
+	# ships libcudnn9 + the -dev headers; x86 CUDA images carry no cuDNN, so enabling
+	# it there is a link failure. $(strip) tolerates stray blanks in CUDA_MAJOR_VERSION.
+	ifeq ($(strip $(CUDA_MAJOR_VERSION)),13)
+	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
+		CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
+	endif
+	endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)