mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-24 16:49:06 -04:00
feat(recon): enable cuDNN conv path on arm64+CUDA13 recon backends
The voice-detect.cpp / face-detect.cpp engines have an opt-in cuDNN implicit-GEMM conv path behind VOICEDETECT_GGML_CUDNN / FACEDETECT_GGML_CUDNN (default OFF) that kills im2col on the GPU and reaches torch-cuDNN parity (SCRFD 2.3x, WeSpeaker/ERes2Net parity), measured on the GB10 (arm64, CUDA 13, sm_121a). Enable it for the CUDA build, but only where cuDNN actually ships: the arm64 + CUDA 13 image (GB10/Jetson/L4T). x86 CUDA images carry no cuDNN, so flipping it on globally for BUILD_TYPE=cublas would be a link failure. The Makefiles gate on CUDA_MAJOR_VERSION=13 + arch (TARGETARCH from the matrix/Docker build, uname -m fallback for local builds). backend/Dockerfile.golang already installs the runtime libcudnn9-cuda-13 in the arm64+CUDA13 apt block; add the matching libcudnn9-dev-cuda-13 so the build-time link resolves. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
This commit is contained in:
@@ -137,7 +137,7 @@ RUN <<EOT bash
|
||||
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
|
||||
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
|
||||
apt-get install -y --no-install-recommends \
|
||||
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
|
||||
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
|
||||
fi
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
@@ -24,6 +24,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||
BUILD_TYPE?=
|
||||
NATIVE?=false
|
||||
|
||||
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
|
||||
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
|
||||
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
|
||||
|
||||
# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
|
||||
# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
|
||||
# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
|
||||
@@ -41,6 +45,15 @@ endif
|
||||
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
|
||||
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU; SCRFD 2.3x,
|
||||
# reaching torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
|
||||
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
|
||||
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
|
||||
ifeq ($(CUDA_MAJOR_VERSION),13)
|
||||
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
|
||||
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
|
||||
@@ -23,6 +23,10 @@ JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||
BUILD_TYPE?=
|
||||
NATIVE?=false
|
||||
|
||||
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
|
||||
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
|
||||
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
|
||||
|
||||
# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
|
||||
# self-contained: dlopen needs no libggml*.so alongside it, only system libs
|
||||
# (libstdc++/libgomp/libc) that the runtime image already provides.
|
||||
@@ -38,6 +42,15 @@ endif
|
||||
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
|
||||
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches
|
||||
# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
|
||||
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
|
||||
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
|
||||
ifeq ($(CUDA_MAJOR_VERSION),13)
|
||||
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
|
||||
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
|
||||
Reference in New Issue
Block a user