From 606128e4e9aeabacb4a029799d4ee6e0d970250a Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 19 Jun 2026 16:16:33 +0100 Subject: [PATCH] feat(vulkan): make Vulkan backends self-contained on the GPU (#10404) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vulkan backends bundled their own loader and ICD manifests but neither the Mesa driver the manifests point at nor a way to make the loader find them, so on a runtime base image without Mesa the loader enumerated zero devices and the GPU silently fell back to CPU (only NVIDIA worked, since its ICD is injected by the container toolkit). - scripts/build/package-gpu-libs.sh: for each installed ICD manifest, bundle the driver .so its library_path names — no hard-coded, platform-dependent soname list — plus that driver's ldd dependencies, skipping manifests whose driver isn't installed. Rewrite each library_path to a bare soname so the bundled driver resolves via the LD_LIBRARY_PATH run.sh already sets. - .docker/install-base-deps.sh, backend/Dockerfile.golang, backend/Dockerfile.python: install mesa-vulkan-drivers in every Vulkan builder so the driver + manifests exist to be packaged (the LunarG SDK ships only the loader and shader tooling). - pkg/model/process.go: when a backend ships vulkan/icd.d/, point the loader at it via VK_DRIVER_FILES/VK_ICD_FILENAMES at launch (no-op otherwise). Covered by pkg/model/process_vulkan_test.go. - backend/go/parakeet-cpp/package.sh: complete the L0 stub (was missing the libc-family ldd walk + GPU-lib packaging) by mirroring whisper, so the vulkan-parakeet image actually bundles its GPU runtime. 
Assisted-by: Claude Code:claude-opus-4-8 Signed-off-by: Richard Palethorpe --- .docker/install-base-deps.sh | 6 ++ backend/Dockerfile.golang | 7 ++- backend/Dockerfile.python | 7 ++- backend/go/parakeet-cpp/package.sh | 55 +++++++++++++++-- pkg/model/process.go | 46 ++++++++++++++- pkg/model/process_vulkan_test.go | 58 ++++++++++++++++++ scripts/build/package-gpu-libs.sh | 95 ++++++++++++++++++++++++++++-- 7 files changed, 262 insertions(+), 12 deletions(-) create mode 100644 pkg/model/process_vulkan_test.go diff --git a/.docker/install-base-deps.sh b/.docker/install-base-deps.sh index 5b0908fa8..2b0e7e0c6 100755 --- a/.docker/install-base-deps.sh +++ b/.docker/install-base-deps.sh @@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \ ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \ clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils + # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD + # manifests. The LunarG SDK below only provides the loader and shader + # tooling, not hardware drivers — without Mesa the packaged Vulkan backend + # would ship a loader that finds no GPU. package-gpu-libs.sh bundles these + # .so files plus their deps into the backend so it stays self-contained. 
+ apt-get install -y mesa-vulkan-drivers libdrm2 if [ "amd64" = "${TARGETARCH:-}" ]; then wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz diff --git a/backend/Dockerfile.golang b/backend/Dockerfile.golang index 75fc3a0d9..d188cdf70 100644 --- a/backend/Dockerfile.golang +++ b/backend/Dockerfile.golang @@ -65,7 +65,12 @@ RUN </dev/null || { echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2 exit 1 } -echo "L0 package layout (full ldd walk lands in L3):" +# Detect architecture and copy the core runtime libs libparakeet.so links +# against, plus the matching dynamic loader as lib/ld.so. +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so" + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2" + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0" +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so" + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2" + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0" +elif [ "$(uname -s)" = "Darwin" ]; then + echo "Detected Darwin" +else + echo "Error: Could not detect architecture" + exit 1 +fi + +# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) +# based on BUILD_TYPE so the backend can reach the GPU without the runtime +# base image shipping those drivers. +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." + source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" ls -liah "$CURDIR/package/" "$CURDIR/package/lib/" diff --git a/pkg/model/process.go b/pkg/model/process.go index 60f8d318e..95e3e0758 100644 --- a/pkg/model/process.go +++ b/pkg/model/process.go @@ -154,11 +154,20 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string return nil, err } + env := os.Environ() + // Vulkan backends are self-contained: they bundle their own loader and + // Mesa driver .so files in lib/ plus the matching ICD manifests in + // vulkan/icd.d/. 
Point the loader at those manifests so it doesn't rely on + // the runtime base image shipping a Vulkan driver (it carries the + // SYCL/Level-Zero stack instead, so the default ICD search path is empty + // and the GPU would silently fall back to CPU). No-op for other backends. + env = append(env, vulkanICDEnv(workDir)...) + grpcControlProcess := process.New( process.WithTemporaryStateDir(), process.WithName(filepath.Base(grpcProcess)), process.WithArgs(append(args, []string{"--addr", serverAddress}...)...), - process.WithEnvironment(os.Environ()...), + process.WithEnvironment(env...), process.WithWorkDir(workDir), ) @@ -249,3 +258,38 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string return grpcControlProcess, nil } + +// vulkanICDEnv returns environment overrides that point the Vulkan loader at +// the ICD manifests a backend bundles in workDir/vulkan/icd.d. Vulkan +// backends ship a self-contained stack — their own loader and Mesa driver .so +// files in lib/ (resolved via the LD_LIBRARY_PATH that run.sh sets) plus the +// matching ICD manifests — so the loader must be told where those manifests +// live; its default search path (/usr/share/vulkan/icd.d, /etc/vulkan/icd.d) +// is empty on the runtime base image. Returns nil when the directory holds no +// manifests (CPU/CUDA/SYCL builds), leaving the host's Vulkan setup untouched. 
+func vulkanICDEnv(workDir string) []string { + icdDir := filepath.Join(workDir, "vulkan", "icd.d") + entries, err := os.ReadDir(icdDir) + if err != nil { + return nil + } + + manifests := make([]string, 0, len(entries)) + for _, e := range entries { + if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") { + continue + } + manifests = append(manifests, filepath.Join(icdDir, e.Name())) + } + if len(manifests) == 0 { + return nil + } + + list := strings.Join(manifests, string(os.PathListSeparator)) + // VK_DRIVER_FILES is the current loader variable; VK_ICD_FILENAMES is its + // deprecated alias, set too so older bundled loaders still pick it up. + return []string{ + "VK_DRIVER_FILES=" + list, + "VK_ICD_FILENAMES=" + list, + } +} diff --git a/pkg/model/process_vulkan_test.go b/pkg/model/process_vulkan_test.go new file mode 100644 index 000000000..c3bb108a7 --- /dev/null +++ b/pkg/model/process_vulkan_test.go @@ -0,0 +1,58 @@ +package model + +import ( + "os" + "path/filepath" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("vulkanICDEnv", func() { + It("returns nil when the backend ships no vulkan/icd.d (CPU/CUDA/SYCL builds)", func() { + Expect(vulkanICDEnv(GinkgoT().TempDir())).To(BeNil()) + }) + + It("returns nil when icd.d exists but holds no .json manifests", func() { + work := GinkgoT().TempDir() + icdDir := filepath.Join(work, "vulkan", "icd.d") + Expect(os.MkdirAll(icdDir, 0o755)).To(Succeed()) + Expect(os.WriteFile(filepath.Join(icdDir, "README.txt"), []byte("not a manifest"), 0o644)).To(Succeed()) + // A directory whose name ends in .json must be ignored. 
+ Expect(os.MkdirAll(filepath.Join(icdDir, "nested.json"), 0o755)).To(Succeed()) + + Expect(vulkanICDEnv(work)).To(BeNil()) + }) + + It("points VK_DRIVER_FILES/VK_ICD_FILENAMES at the bundled manifests", func() { + work := GinkgoT().TempDir() + icdDir := filepath.Join(work, "vulkan", "icd.d") + Expect(os.MkdirAll(icdDir, 0o755)).To(Succeed()) + for _, name := range []string{"intel_icd.json", "lvp_icd.json"} { + Expect(os.WriteFile(filepath.Join(icdDir, name), []byte("{}"), 0o644)).To(Succeed()) + } + + env := vulkanICDEnv(work) + Expect(env).To(HaveLen(2)) + + got := map[string]string{} + for _, kv := range env { + k, v, ok := strings.Cut(kv, "=") + Expect(ok).To(BeTrue(), "malformed env entry %q", kv) + got[k] = v + } + + for _, key := range []string{"VK_DRIVER_FILES", "VK_ICD_FILENAMES"} { + Expect(got).To(HaveKey(key)) + // Both manifests must be listed as absolute paths, joined by the + // OS path-list separator the Vulkan loader expects. + parts := strings.Split(got[key], string(os.PathListSeparator)) + Expect(parts).To(HaveLen(2)) + for _, p := range parts { + Expect(filepath.IsAbs(p)).To(BeTrue(), "%s entry %q must be absolute", key, p) + Expect(p).To(HaveSuffix(".json")) + } + } + }) +}) diff --git a/scripts/build/package-gpu-libs.sh b/scripts/build/package-gpu-libs.sh index 2b5b02aab..40f410173 100755 --- a/scripts/build/package-gpu-libs.sh +++ b/scripts/build/package-gpu-libs.sh @@ -109,6 +109,38 @@ copy_libs_glob() { done } +# Returns success for the core runtime libs the base image and package.sh +# already provide. We must NOT bundle our own copies of these — a second libc +# or libstdc++ on LD_LIBRARY_PATH clashes with the loader and the rest of the +# process — so they're skipped when pulling in a driver's transitive deps. 
+is_core_lib() { + case "$1" in + ld-linux*|ld.so|libc.so.*|libm.so.*|libdl.so.*|libpthread.so.*|librt.so.*|\ + libgcc_s.so.*|libstdc++.so.*|libresolv.so.*|libutil.so.*|linux-vdso.so.*) + return 0 ;; + esac + return 1 +} + +# Copy the shared-library dependencies of an ELF file into TARGET_LIB_DIR. +# Used to make a bundled GPU driver self-contained: e.g. the Mesa Vulkan ICDs +# pull in libdrm, libexpat and (for RADV/lavapipe) libLLVM, none of which the +# runtime base image is guaranteed to have. Core libc-family deps are skipped. +copy_elf_deps() { + local elf="$1" + [ -e "$elf" ] || return 0 + command -v ldd >/dev/null 2>&1 || return 0 + + # ldd lines look like: "libfoo.so.1 => /path/to/libfoo.so.1 (0x..)". + # Take the resolved absolute path (field 3) and skip vdso/static entries. + while read -r dep; do + if is_core_lib "$(basename "$dep")"; then + continue + fi + copy_lib "$dep" + done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}') +} + # Package NVIDIA CUDA libraries package_cuda_libs() { echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..." @@ -284,7 +316,7 @@ package_vulkan_libs() { "/usr/local/lib" ) - # Core Vulkan runtime libraries + # Core Vulkan runtime: the loader plus the shader tooling shipped by the SDK. local vulkan_libs=( "libvulkan.so*" "libshaderc_shared.so*" @@ -301,10 +333,63 @@ package_vulkan_libs() { fi done - # Copy Vulkan ICD files + # Bundle the ICD drivers. Rather than hard-code Mesa's (platform- and + # version-dependent) driver sonames, treat each installed ICD manifest as + # the source of truth: every /usr/share/vulkan/icd.d/*.json names the exact + # driver .so it needs in its "library_path". So we copy whatever drivers + # the manifests reference (libvulkan_intel/radeon/lvp/... on amd64, the SoC + # drivers on arm64, ...) plus each driver's transitive deps, and skip any + # manifest whose driver isn't actually installed. The loader picks the + # right driver for the GPU at runtime. 
if [ -d "/usr/share/vulkan/icd.d" ]; then - mkdir -p "$TARGET_LIB_DIR/../vulkan/icd.d" - cp -arfL /usr/share/vulkan/icd.d/* "$TARGET_LIB_DIR/../vulkan/icd.d/" 2>/dev/null || true + local icd_dest="$TARGET_LIB_DIR/../vulkan/icd.d" + mkdir -p "$icd_dest" + + local manifest driver driver_base resolved lib_path + for manifest in /usr/share/vulkan/icd.d/*.json; do + [ -e "$manifest" ] || continue + + # Pull the driver path out of "library_path": "DRIVER_PATH". + driver=$(sed -nE 's/.*"library_path"[[:space:]]*:[[:space:]]*"([^"]+)".*/\1/p' "$manifest" | head -n1) + [ -n "$driver" ] || continue + driver_base=$(basename "$driver") + + # Resolve to an absolute path: honour an absolute library_path, + # else look in the standard lib dirs, else fall back to ldconfig. + resolved="" + case "$driver" in + /*) [ -e "$driver" ] && resolved="$driver" ;; + esac + if [ -z "$resolved" ]; then + for lib_path in "${vulkan_lib_paths[@]}"; do + if [ -e "${lib_path}/${driver_base}" ]; then + resolved="${lib_path}/${driver_base}" + break + fi + done + fi + if [ -z "$resolved" ] && command -v ldconfig >/dev/null 2>&1; then + resolved=$(ldconfig -p | awk -v n="$driver_base" '$1 == n { print $NF; exit }') + fi + + if [ -z "$resolved" ] || [ ! -e "$resolved" ]; then + echo "Vulkan ICD: driver '$driver_base' for $(basename "$manifest") not installed; skipping its manifest" >&2 + continue + fi + + # Bundle the driver + its transitive deps (libdrm, libexpat, and + # libLLVM for RADV/lavapipe, ...) so the backend is self-contained + # on a runtime base image without Mesa. + copy_lib "$resolved" + copy_elf_deps "$resolved" + + # Copy the manifest and rewrite its library_path to a bare soname + # so the loader resolves our bundled driver via LD_LIBRARY_PATH + # (run.sh adds lib/ to it) instead of a host path that won't exist + # on the runtime image. 
+ cp -arfL "$manifest" "$icd_dest/" 2>/dev/null || true + sed -i -E 's#("library_path"[[:space:]]*:[[:space:]]*")[^"]*/#\1#' "$icd_dest/$(basename "$manifest")" + done fi echo "Vulkan libraries packaged successfully" @@ -345,6 +430,8 @@ package_gpu_libs() { export -f package_gpu_libs export -f copy_lib export -f copy_libs_glob +export -f is_core_lib +export -f copy_elf_deps export -f package_cuda_libs export -f package_rocm_libs export -f package_intel_libs