fix: gate CUDA directory checks on GPU vendor to prevent false CUDA detection (#8942)

Container images that install CUDA runtime libraries (e.g., cuda-cudart-12-5
via apt) create /usr/local/cuda-12 directories as a side effect. The previous
code checked for these directories before checking whether a GPU was present,
causing CPU-only hosts to select a CUDA backend that crashes because
libcuda.so.1 is absent.

Reorder checks so CUDA directory existence only refines the capability when
an NVIDIA GPU is actually detected, consistent with the arm64 L4T code path.

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
This commit is contained in:
Sertaç Özercan
2026-03-11 23:53:39 -07:00
committed by GitHub
parent 7dc691c171
commit 45d18813bd
3 changed files with 156 additions and 11 deletions

View File

@@ -132,29 +132,32 @@ func (s *SystemState) getSystemCapabilities() string {
}
}
if cuda13DirExists {
s.systemCapabilities = nvidiaCuda13
return s.systemCapabilities
}
if cuda12DirExists {
s.systemCapabilities = nvidiaCuda12
return s.systemCapabilities
}
// No GPU detected → default capability
if s.GPUVendor == "" {
xlog.Info("Default capability (no GPU detected)", "env", capabilityEnv)
s.systemCapabilities = defaultCapability
return s.systemCapabilities
}
// If vram is less than 4GB, let's default to CPU but warn the user that they can override that via env
// GPU detected but insufficient VRAM → default with warning
if s.VRAM <= 4*1024*1024*1024 {
xlog.Warn("VRAM is less than 4GB, defaulting to CPU", "env", capabilityEnv)
s.systemCapabilities = defaultCapability
return s.systemCapabilities
}
// CUDA directories refine capability only for NVIDIA GPUs
if s.GPUVendor == Nvidia {
if cuda13DirExists {
s.systemCapabilities = nvidiaCuda13
return s.systemCapabilities
}
if cuda12DirExists {
s.systemCapabilities = nvidiaCuda12
return s.systemCapabilities
}
}
s.systemCapabilities = s.GPUVendor
return s.systemCapabilities
}

View File

@@ -0,0 +1,129 @@
package system
import (
"os"
"runtime"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for SystemState.getSystemCapabilities: CUDA directory flags must only
// refine the reported capability when an NVIDIA GPU with enough VRAM is
// present, and must be ignored otherwise.
var _ = Describe("getSystemCapabilities", func() {
	const eightGB = 8 * 1024 * 1024 * 1024
	const twoGB = 2 * 1024 * 1024 * 1024

	BeforeEach(func() {
		if runtime.GOOS == "darwin" {
			Skip("darwin short-circuits before reaching CUDA logic")
		}

		// Snapshot the process-wide state these specs mutate. The snapshot is
		// taken after the skip check and restored through DeferCleanup, so a
		// skipped spec never writes to the package-level CUDA flags.
		// LookupEnv distinguishes "unset" from "set to the empty string".
		prevEnv, envWasSet := os.LookupEnv(capabilityEnv)
		prevCuda12, prevCuda13 := cuda12DirExists, cuda13DirExists

		DeferCleanup(func() {
			cuda12DirExists, cuda13DirExists = prevCuda12, prevCuda13
			if envWasSet {
				Expect(os.Setenv(capabilityEnv, prevEnv)).To(Succeed())
			} else {
				Expect(os.Unsetenv(capabilityEnv)).To(Succeed())
			}
		})

		// Clear the capability environment variable for the duration of the
		// spec so each case starts from the same environment.
		Expect(os.Unsetenv(capabilityEnv)).To(Succeed())
	})

	// testCase describes one detection scenario and its expected outcome.
	type testCase struct {
		gpuVendor      string   // value assigned to SystemState.GPUVendor
		vram           uint64   // value assigned to SystemState.VRAM, in bytes
		cuda12         bool     // value assigned to cuda12DirExists
		cuda13         bool     // value assigned to cuda13DirExists
		wantCapability string   // expected result of getSystemCapabilities
		wantTokens     []string // expected result of BackendPreferenceTokens
	}

	DescribeTable("capability detection",
		func(tc testCase) {
			cuda12DirExists = tc.cuda12
			cuda13DirExists = tc.cuda13

			s := &SystemState{
				GPUVendor: tc.gpuVendor,
				VRAM:      tc.vram,
			}

			Expect(s.getSystemCapabilities()).To(Equal(tc.wantCapability))
			Expect(s.BackendPreferenceTokens()).To(Equal(tc.wantTokens))
		},
		Entry("CUDA dir present but no GPU", testCase{
			gpuVendor:      "",
			vram:           0,
			cuda12:         true,
			cuda13:         false,
			wantCapability: "default",
			wantTokens:     []string{"cpu"},
		}),
		Entry("CUDA 12 with NVIDIA GPU", testCase{
			gpuVendor:      Nvidia,
			vram:           eightGB,
			cuda12:         true,
			cuda13:         false,
			wantCapability: "nvidia-cuda-12",
			wantTokens:     []string{"cuda", "vulkan", "cpu"},
		}),
		Entry("CUDA 13 with NVIDIA GPU", testCase{
			gpuVendor:      Nvidia,
			vram:           eightGB,
			cuda12:         false,
			cuda13:         true,
			wantCapability: "nvidia-cuda-13",
			wantTokens:     []string{"cuda", "vulkan", "cpu"},
		}),
		Entry("Both CUDA dirs with NVIDIA GPU prefers 13", testCase{
			gpuVendor:      Nvidia,
			vram:           eightGB,
			cuda12:         true,
			cuda13:         true,
			wantCapability: "nvidia-cuda-13",
			wantTokens:     []string{"cuda", "vulkan", "cpu"},
		}),
		Entry("CUDA dir with AMD GPU ignored", testCase{
			gpuVendor:      AMD,
			vram:           eightGB,
			cuda12:         true,
			cuda13:         false,
			wantCapability: "amd",
			wantTokens:     []string{"rocm", "hip", "vulkan", "cpu"},
		}),
		Entry("No CUDA dir and no GPU", testCase{
			gpuVendor:      "",
			vram:           0,
			cuda12:         false,
			cuda13:         false,
			wantCapability: "default",
			wantTokens:     []string{"cpu"},
		}),
		Entry("No CUDA dir with NVIDIA GPU", testCase{
			gpuVendor:      Nvidia,
			vram:           eightGB,
			cuda12:         false,
			cuda13:         false,
			wantCapability: "nvidia",
			wantTokens:     []string{"cuda", "vulkan", "cpu"},
		}),
		Entry("CUDA dir with NVIDIA GPU but low VRAM", testCase{
			gpuVendor:      Nvidia,
			vram:           twoGB,
			cuda12:         true,
			cuda13:         false,
			wantCapability: "default",
			wantTokens:     []string{"cpu"},
		}),
	)
})

View File

@@ -0,0 +1,13 @@
package system
import (
"testing"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// TestSystem is the `go test` entry point for this package's Ginkgo specs.
func TestSystem(t *testing.T) {
	const suiteTitle = "System test suite"

	// Gomega assertions report failures through Ginkgo's Fail.
	RegisterFailHandler(Fail)

	RunSpecs(t, suiteTitle)
}