feat(vulkan): make Vulkan backends self-contained on the GPU (#10404)

Vulkan backends bundled their own loader and ICD manifests but neither the
Mesa driver the manifests point at nor a way to make the loader find them,
so on a runtime base image without Mesa the loader enumerated zero devices
and the GPU silently fell back to CPU (only NVIDIA worked, since its ICD is
injected by the container toolkit).

- scripts/build/package-gpu-libs.sh: for each installed ICD manifest, bundle
  the driver .so its library_path names — no hard-coded, platform-dependent
  soname list — plus that driver's ldd dependencies, skipping manifests whose
  driver isn't installed. Rewrite each library_path to a bare soname so the
  bundled driver resolves via the LD_LIBRARY_PATH run.sh already sets.
- .docker/install-base-deps.sh, backend/Dockerfile.golang,
  backend/Dockerfile.python: install mesa-vulkan-drivers in every Vulkan
  builder so the driver + manifests exist to be packaged (the LunarG SDK
  ships only the loader and shader tooling).
- pkg/model/process.go: when a backend ships vulkan/icd.d/, point the loader
  at it via VK_DRIVER_FILES/VK_ICD_FILENAMES at launch (no-op otherwise).
  Covered by pkg/model/process_vulkan_test.go.
- backend/go/parakeet-cpp/package.sh: complete the L0 stub (was missing the
  libc-family ldd walk + GPU-lib packaging) by mirroring whisper, so the
  vulkan-parakeet image actually bundles its GPU runtime.

Assisted-by: Claude Code:claude-opus-4-8

Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
Richard Palethorpe
2026-06-19 16:16:33 +01:00
committed by GitHub
parent 59c7ad5153
commit 606128e4e9
7 changed files with 262 additions and 12 deletions

View File

@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
# manifests. The LunarG SDK below only provides the loader and shader
# tooling, not hardware drivers — without Mesa the packaged Vulkan backend
# would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
# .so files plus their deps into the backend so it stays self-contained.
apt-get install -y mesa-vulkan-drivers libdrm2
if [ "amd64" = "${TARGETARCH:-}" ]; then
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz

View File

@@ -65,7 +65,12 @@ RUN <<EOT bash
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
apt-get install -y mesa-vulkan-drivers libdrm2
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
# LunarG SDK below only provides the loader and shader tooling, not
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
# bundle and the packaged backend finds no GPU at runtime.
if [ "amd64" = "$TARGETARCH" ]; then
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \

View File

@@ -66,7 +66,12 @@ RUN <<EOT bash
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
apt-get install -y mesa-vulkan-drivers libdrm2
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
# LunarG SDK below only provides the loader and shader tooling, not
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
# bundle and the packaged backend finds no GPU at runtime.
if [ "amd64" = "$TARGETARCH" ]; then
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \

View File

@@ -1,23 +1,68 @@
#!/bin/bash
#
# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
# BUILD_TYPE so the package is self-contained. Mirrors
# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
# through lib/ld.so so the packaged libc is used instead of the host's.
set -e
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
mkdir -p "$CURDIR/package/lib"
cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
exit 1
}
echo "L0 package layout (full ldd walk lands in L3):"
# Detect the architecture from whichever dynamic loader is installed, then
# copy that loader (as lib/ld.so) plus the core runtime libs libparakeet.so
# links against into package/lib. `set -e` is active, so a missing library
# aborts packaging. Darwin is recognised but nothing is copied for it.
rt_libs=(
    libc.so.6
    libgcc_s.so.1
    libstdc++.so.6
    libm.so.6
    libgomp.so.1
    libdl.so.2
    librt.so.1
    libpthread.so.0
)
rt_loader=""
rt_lib_dir=""
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    rt_loader="/lib64/ld-linux-x86-64.so.2"
    rt_lib_dir="/lib/x86_64-linux-gnu"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    rt_loader="/lib/ld-linux-aarch64.so.1"
    rt_lib_dir="/lib/aarch64-linux-gnu"
elif [ "$(uname -s)" = "Darwin" ]; then
    echo "Detected Darwin"
else
    # Diagnostics go to stderr, matching the libparakeet.so error above.
    echo "Error: Could not detect architecture" >&2
    exit 1
fi
if [ -n "$rt_loader" ]; then
    # -L dereferences the distro's symlinks so real files land in the package.
    cp -arfLv "$rt_loader" "$CURDIR/package/lib/ld.so"
    for rt_lib in "${rt_libs[@]}"; do
        cp -arfLv "${rt_lib_dir}/${rt_lib}" "$CURDIR/package/lib/${rt_lib}"
    done
fi
# Bundle the GPU runtime (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
# matching BUILD_TYPE, so the backend can reach the GPU even when the runtime
# base image ships none of those drivers. The step is skipped silently when
# the shared helper script is not present in the repository checkout.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [[ -f "${GPU_LIB_SCRIPT}" ]]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    # The helper is sourced (not executed) so that package_gpu_libs is defined
    # in this shell; the package lib directory is passed as its first argument.
    source "${GPU_LIB_SCRIPT}" "${CURDIR}/package/lib"
    package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"

View File

@@ -154,11 +154,20 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
return nil, err
}
env := os.Environ()
// Vulkan backends are self-contained: they bundle their own loader and
// Mesa driver .so files in lib/ plus the matching ICD manifests in
// vulkan/icd.d/. Point the loader at those manifests so it doesn't rely on
// the runtime base image shipping a Vulkan driver (it carries the
// SYCL/Level-Zero stack instead, so the default ICD search path is empty
// and the GPU would silently fall back to CPU). No-op for other backends.
env = append(env, vulkanICDEnv(workDir)...)
grpcControlProcess := process.New(
process.WithTemporaryStateDir(),
process.WithName(filepath.Base(grpcProcess)),
process.WithArgs(append(args, []string{"--addr", serverAddress}...)...),
process.WithEnvironment(os.Environ()...),
process.WithEnvironment(env...),
process.WithWorkDir(workDir),
)
@@ -249,3 +258,38 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
return grpcControlProcess, nil
}
// vulkanICDEnv returns environment overrides that point the Vulkan loader at
// the ICD manifests a backend bundles in <workDir>/vulkan/icd.d. Vulkan
// backends ship a self-contained stack — their own loader and Mesa driver .so
// files in lib/ (resolved via the LD_LIBRARY_PATH that run.sh sets) plus the
// matching ICD manifests — so the loader must be told where those manifests
// live; its default search path (/usr/share/vulkan/icd.d, /etc/vulkan/icd.d)
// is empty on the runtime base image. Returns nil when the directory holds no
// manifests (CPU/CUDA/SYCL builds), leaving the host's Vulkan setup untouched.
func vulkanICDEnv(workDir string) []string {
icdDir := filepath.Join(workDir, "vulkan", "icd.d")
entries, err := os.ReadDir(icdDir)
if err != nil {
return nil
}
manifests := make([]string, 0, len(entries))
for _, e := range entries {
if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
continue
}
manifests = append(manifests, filepath.Join(icdDir, e.Name()))
}
if len(manifests) == 0 {
return nil
}
list := strings.Join(manifests, string(os.PathListSeparator))
// VK_DRIVER_FILES is the current loader variable; VK_ICD_FILENAMES is its
// deprecated alias, set too so older bundled loaders still pick it up.
return []string{
"VK_DRIVER_FILES=" + list,
"VK_ICD_FILENAMES=" + list,
}
}

View File

@@ -0,0 +1,58 @@
package model
import (
"os"
"path/filepath"
"strings"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for vulkanICDEnv: it must stay a no-op for backends that bundle no
// Vulkan ICD manifests, and otherwise export both loader variables pointing
// at every bundled manifest.
var _ = Describe("vulkanICDEnv", func() {
	// newICDDir creates <tmp>/vulkan/icd.d under a fresh temporary work
	// directory and returns both paths.
	newICDDir := func() (workDir, manifestDir string) {
		workDir = GinkgoT().TempDir()
		manifestDir = filepath.Join(workDir, "vulkan", "icd.d")
		Expect(os.MkdirAll(manifestDir, 0o755)).To(Succeed())
		return workDir, manifestDir
	}

	It("returns nil when the backend ships no vulkan/icd.d (CPU/CUDA/SYCL builds)", func() {
		bareWorkDir := GinkgoT().TempDir()
		Expect(vulkanICDEnv(bareWorkDir)).To(BeNil())
	})

	It("returns nil when icd.d exists but holds no .json manifests", func() {
		workDir, manifestDir := newICDDir()

		readme := filepath.Join(manifestDir, "README.txt")
		Expect(os.WriteFile(readme, []byte("not a manifest"), 0o644)).To(Succeed())

		// A directory whose name ends in .json must be ignored.
		jsonNamedDir := filepath.Join(manifestDir, "nested.json")
		Expect(os.MkdirAll(jsonNamedDir, 0o755)).To(Succeed())

		Expect(vulkanICDEnv(workDir)).To(BeNil())
	})

	It("points VK_DRIVER_FILES/VK_ICD_FILENAMES at the bundled manifests", func() {
		workDir, manifestDir := newICDDir()

		manifestNames := []string{"intel_icd.json", "lvp_icd.json"}
		for i := range manifestNames {
			target := filepath.Join(manifestDir, manifestNames[i])
			Expect(os.WriteFile(target, []byte("{}"), 0o644)).To(Succeed())
		}

		overrides := vulkanICDEnv(workDir)
		Expect(overrides).To(HaveLen(2))

		// Index the KEY=value entries by key for the per-variable checks.
		byKey := make(map[string]string, len(overrides))
		for _, entry := range overrides {
			name, value, found := strings.Cut(entry, "=")
			Expect(found).To(BeTrue(), "malformed env entry %q", entry)
			byKey[name] = value
		}

		for _, key := range []string{"VK_DRIVER_FILES", "VK_ICD_FILENAMES"} {
			Expect(byKey).To(HaveKey(key))

			// Both manifests must be listed as absolute paths, joined by the
			// OS path-list separator the Vulkan loader expects.
			listed := strings.Split(byKey[key], string(os.PathListSeparator))
			Expect(listed).To(HaveLen(2))
			for _, manifestPath := range listed {
				Expect(filepath.IsAbs(manifestPath)).To(BeTrue(), "%s entry %q must be absolute", key, manifestPath)
				Expect(manifestPath).To(HaveSuffix(".json"))
			}
		}
	})
})

View File

@@ -109,6 +109,38 @@ copy_libs_glob() {
done
}
# Tell whether a library file name belongs to the core runtime set (dynamic
# loader, libc family, libgcc_s, libstdc++, vdso) that the base image and
# package.sh already provide. Those must NOT be bundled a second time: another
# libc or libstdc++ on LD_LIBRARY_PATH clashes with the loader and the rest of
# the process, so callers skip them when collecting a driver's dependencies.
# Arguments: $1 - library file name (basename, not a path)
# Returns:   0 when the name matches a core library pattern, 1 otherwise
is_core_lib() {
    local candidate="$1"
    local glob
    local -a core_globs=(
        'ld-linux*'
        'ld.so'
        'libc.so.*'
        'libm.so.*'
        'libdl.so.*'
        'libpthread.so.*'
        'librt.so.*'
        'libgcc_s.so.*'
        'libstdc++.so.*'
        'libresolv.so.*'
        'libutil.so.*'
        'linux-vdso.so.*'
    )
    for glob in "${core_globs[@]}"; do
        # The right-hand side is left unquoted on purpose so it is matched as
        # a glob pattern rather than compared as a literal string.
        if [[ "$candidate" == $glob ]]; then
            return 0
        fi
    done
    return 1
}
# Copy the shared-library dependencies of an ELF file via copy_lib.
# Used to make a bundled GPU driver self-contained: e.g. the Mesa Vulkan ICDs
# pull in libdrm, libexpat and (for RADV/lavapipe) libLLVM, none of which the
# runtime base image is guaranteed to have. Dependencies that is_core_lib
# recognises (libc family, loader, ...) are skipped.
# Arguments: $1 - path to the ELF file whose dependencies should be copied
# Returns:   0 when the file does not exist or ldd is unavailable (nothing is
#            copied); otherwise the status of the last loop step.
copy_elf_deps() {
    local elf="$1"
    # Keep the loop variable local: this file is sourced into packaging
    # scripts, so an undeclared variable would overwrite the caller's global.
    local dep_path

    if [ ! -e "$elf" ]; then
        return 0
    fi
    if ! command -v ldd >/dev/null 2>&1; then
        return 0
    fi

    # ldd lines look like: "<TAB>libfoo.so.1 => /path/to/libfoo.so.1 (0x..)".
    # awk keeps only lines with "=>" whose third field is an absolute path,
    # which drops vdso/static entries and unresolved ("not found") libraries.
    while IFS= read -r dep_path; do
        if ! is_core_lib "$(basename "$dep_path")"; then
            copy_lib "$dep_path"
        fi
    done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}')
}
# Package NVIDIA CUDA libraries
package_cuda_libs() {
echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -284,7 +316,7 @@ package_vulkan_libs() {
"/usr/local/lib"
)
# Core Vulkan runtime libraries
# Core Vulkan runtime: the loader plus the shader tooling shipped by the SDK.
local vulkan_libs=(
"libvulkan.so*"
"libshaderc_shared.so*"
@@ -301,10 +333,63 @@ package_vulkan_libs() {
fi
done
# Copy Vulkan ICD files
# Bundle the ICD drivers. Rather than hard-code Mesa's (platform- and
# version-dependent) driver sonames, treat each installed ICD manifest as
# the source of truth: every /usr/share/vulkan/icd.d/*.json names the exact
# driver .so it needs in its "library_path". So we copy whatever drivers
# the manifests reference (libvulkan_intel/radeon/lvp/... on amd64, the SoC
# drivers on arm64, ...) plus each driver's transitive deps, and skip any
# manifest whose driver isn't actually installed. The loader picks the
# right driver for the GPU at runtime.
if [ -d "/usr/share/vulkan/icd.d" ]; then
mkdir -p "$TARGET_LIB_DIR/../vulkan/icd.d"
cp -arfL /usr/share/vulkan/icd.d/* "$TARGET_LIB_DIR/../vulkan/icd.d/" 2>/dev/null || true
local icd_dest="$TARGET_LIB_DIR/../vulkan/icd.d"
mkdir -p "$icd_dest"
local manifest driver driver_base resolved lib_path
for manifest in /usr/share/vulkan/icd.d/*.json; do
[ -e "$manifest" ] || continue
# Pull the driver path out of "library_path": "<path-or-soname>".
driver=$(sed -nE 's/.*"library_path"[[:space:]]*:[[:space:]]*"([^"]+)".*/\1/p' "$manifest" | head -n1)
[ -n "$driver" ] || continue
driver_base=$(basename "$driver")
# Resolve to an absolute path: honour an absolute library_path,
# else look in the standard lib dirs, else fall back to ldconfig.
resolved=""
case "$driver" in
/*) [ -e "$driver" ] && resolved="$driver" ;;
esac
if [ -z "$resolved" ]; then
for lib_path in "${vulkan_lib_paths[@]}"; do
if [ -e "${lib_path}/${driver_base}" ]; then
resolved="${lib_path}/${driver_base}"
break
fi
done
fi
if [ -z "$resolved" ] && command -v ldconfig >/dev/null 2>&1; then
resolved=$(ldconfig -p | awk -v n="$driver_base" '$1 == n { print $NF; exit }')
fi
if [ -z "$resolved" ] || [ ! -e "$resolved" ]; then
echo "Vulkan ICD: driver '$driver_base' for $(basename "$manifest") not installed; skipping its manifest" >&2
continue
fi
# Bundle the driver + its transitive deps (libdrm, libexpat, and
# libLLVM for RADV/lavapipe, ...) so the backend is self-contained
# on a runtime base image without Mesa.
copy_lib "$resolved"
copy_elf_deps "$resolved"
# Copy the manifest and rewrite its library_path to a bare soname
# so the loader resolves our bundled driver via LD_LIBRARY_PATH
# (run.sh adds lib/ to it) instead of a host path that won't exist
# on the runtime image.
cp -arfL "$manifest" "$icd_dest/" 2>/dev/null || true
sed -i -E 's#("library_path"[[:space:]]*:[[:space:]]*")[^"]*/#\1#' "$icd_dest/$(basename "$manifest")"
done
fi
echo "Vulkan libraries packaged successfully"
@@ -345,6 +430,8 @@ package_gpu_libs() {
export -f package_gpu_libs
export -f copy_lib
export -f copy_libs_glob
export -f is_core_lib
export -f copy_elf_deps
export -f package_cuda_libs
export -f package_rocm_libs
export -f package_intel_libs