diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go index 94b5c945f..43992a6b4 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go @@ -31,7 +31,7 @@ import ( ) const ( - cudaCompatPath = "/usr/local/cuda/compat" + defaultCudaCompatPath = "/usr/local/cuda/compat" // cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename // in ld.so.conf.d that includes a reference to the CUDA compat path. // The 00-compat prefix is chosen to ensure that these libraries have a @@ -44,8 +44,11 @@ type command struct { } type options struct { - hostDriverVersion string - containerSpec string + cudaCompatContainerRoot string + hostDriverVersion string + // containerSpec allows the path to the container spec to be specified for + // testing. + containerSpec string } // NewCommand constructs a cuda-compat command with the specified logger @@ -76,6 +79,12 @@ func (m command) build() *cli.Command { Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.", Destination: &cfg.hostDriverVersion, }, + &cli.StringFlag{ + Name: "cuda-compat-container-root", + Usage: "Specify the folder in which CUDA compat libraries are located in the container", + Value: defaultCudaCompatPath, + Destination: &cfg.cudaCompatContainerRoot, + }, &cli.StringFlag{ Name: "container-spec", Hidden: true, @@ -108,7 +117,7 @@ func (m command) run(_ *cli.Command, cfg *options) error { return fmt.Errorf("failed to determined container root: %w", err) } - containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion) + containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.cudaCompatContainerRoot, cfg.hostDriverVersion) if err != nil { return fmt.Errorf("failed to get container forward compat directory: %w", err) } @@ -119,17 +128,17 @@ func (m command) run(_ *cli.Command, cfg *options) error { return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir) } -func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) { +func (m command) getContainerForwardCompatDir(containerRoot containerRoot, cudaCompatRoot string, hostDriverVersion string) (string, error) { if hostDriverVersion == "" { m.logger.Debugf("Host driver version not specified") return "", nil } - if !containerRoot.hasPath(cudaCompatPath) { + if !containerRoot.hasPath(cudaCompatRoot) { m.logger.Debugf("No CUDA forward compatibility libraries directory in container") return "", nil } - libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*")) + libs, err := containerRoot.globFiles(filepath.Join(cudaCompatRoot, "libcuda.so.*.*")) if err != nil { m.logger.Warningf("Failed to find CUDA compat library: %w", err) return "", nil diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go index 31fc2e085..937773698 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go @@ -131,7 +131,7 @@ func TestCompatLibs(t *testing.T) { c := command{ logger: logger, } - containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion) + containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), defaultCudaCompatPath, tc.hostDriverVersion) require.NoError(t, err) require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir) }) diff --git a/internal/discover/compat_libs.go b/internal/discover/compat_libs.go index 71926a9f6..977fdf189 100644 --- a/internal/discover/compat_libs.go +++ b/internal/discover/compat_libs.go @@ -8,11 +8,14 @@ import ( // NewCUDACompatHookDiscoverer creates a discoverer for a enable-cuda-compat hook. // This hook is responsible for setting up CUDA compatibility in the container and depends on the host driver version. -func NewCUDACompatHookDiscoverer(logger logger.Interface, hookCreator HookCreator, version string) Discover { +func NewCUDACompatHookDiscoverer(logger logger.Interface, hookCreator HookCreator, version string, cudaCompatContainerRoot string) Discover { var args []string if version != "" && !strings.Contains(version, "*") { args = append(args, "--host-driver-version="+version) } + if cudaCompatContainerRoot != "" { + args = append(args, "--cuda-compat-container-root="+cudaCompatContainerRoot) + } return hookCreator.Create("enable-cuda-compat", args...) } diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index f61e6cd87..84b01ee4b 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -107,7 +107,7 @@ func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, dr return nil, fmt.Errorf("failed to get driver version: %w", err) } - compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, hookCreator, version) + compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, hookCreator, version, "") // For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook. if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" { return compatLibHookDiscoverer, nil diff --git a/internal/platform-support/tegra/tegra.go b/internal/platform-support/tegra/tegra.go index b31a98a3d..982bdad3b 100644 --- a/internal/platform-support/tegra/tegra.go +++ b/internal/platform-support/tegra/tegra.go @@ -59,11 +59,6 @@ func New(opts ...Option) (discover.Discover, error) { return nil, fmt.Errorf("failed to create discoverer for mount specs: %v", err) } - ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(o.logger, mountSpecDiscoverer, o.hookCreator, o.ldconfigPath) - if err != nil { - return nil, fmt.Errorf("failed to create ldcach update hook discoverer: %v", err) - } - tegraSystemMounts := discover.NewMounts( o.logger, lookup.NewFileLocator(lookup.WithLogger(o.logger)), @@ -75,9 +70,6 @@ func New(opts ...Option) (discover.Discover, error) { d := discover.Merge( mountSpecDiscoverer, - // The ldcacheUpdateHook is added after the mount spec discoverer to - // ensure that the symlinks are included. - ldcacheUpdateHook, tegraSystemMounts, ) diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index 191cc6a91..168275993 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -101,7 +101,7 @@ func (l *nvcdilib) NewDriverLibraryDiscoverer(version string, libcudaSoParentDir ) discoverers = append(discoverers, driverDotSoSymlinksDiscoverer) - cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version) + cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version, "") discoverers = append(discoverers, cudaCompatLibHookDiscoverer) updateLDCache, _ := discover.NewLDCacheUpdateHook(l.logger, libraries, l.hookCreator, l.ldconfigPath) diff --git a/pkg/nvcdi/lib-csv.go b/pkg/nvcdi/lib-csv.go index f6493bd9a..77fcd95b3 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -20,6 +20,7 @@ import ( "fmt" "slices" "strconv" + "strings" "tags.cncf.io/container-device-interface/pkg/cdi" "tags.cncf.io/container-device-interface/specs-go" @@ -101,12 +102,12 @@ func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) { } e, err := edits.FromDiscoverer(deviceNodeDiscoverer) if err != nil { - return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err) + return nil, fmt.Errorf("failed to create container edits for CSV files: %w", err) } names, err := l.deviceNamers.GetDeviceNames(l.index, l) if err != nil { - return nil, fmt.Errorf("failed to get device name: %v", err) + return nil, fmt.Errorf("failed to get device name: %w", err) } var deviceSpecs []specs.Device for _, name := range names { @@ -157,22 +158,7 @@ func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) { // GetCommonEdits generates a CDI specification that can be used for ANY devices // These explicitly do not include any device nodes. func (l *csvlib) GetCommonEdits() (*cdi.ContainerEdits, error) { - mountSpecs := tegra.Transform( - tegra.Transform( - tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), - tegra.WithoutDeviceNodes(), - ), - tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...), - ) - driverDiscoverer, err := tegra.New( - tegra.WithLogger(l.logger), - tegra.WithDriverRoot(l.driverRoot), - tegra.WithDevRoot(l.devRoot), - tegra.WithHookCreator(l.hookCreator), - tegra.WithLdconfigPath(l.ldconfigPath), - tegra.WithLibrarySearchPaths(l.librarySearchPaths...), - tegra.WithMountSpecs(mountSpecs), - ) + driverDiscoverer, err := l.driverDiscoverer() if err != nil { return nil, fmt.Errorf("failed to create driver discoverer from CSV files: %w", err) } @@ -321,3 +307,93 @@ func isIntegratedGPU(d nvml.Device) (bool, error) { } return pciInfo.Device == 0, nil } + +func (l *csvlib) driverDiscoverer() (discover.Discover, error) { + mountSpecs := tegra.Transform( + tegra.Transform( + tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), + tegra.WithoutDeviceNodes(), + ), + tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...), + ) + driverDiscoverer, err := tegra.New( + tegra.WithLogger(l.logger), + tegra.WithDriverRoot(l.driverRoot), + tegra.WithDevRoot(l.devRoot), + tegra.WithHookCreator(l.hookCreator), + tegra.WithLdconfigPath(l.ldconfigPath), + tegra.WithLibrarySearchPaths(l.librarySearchPaths...), + tegra.WithMountSpecs(mountSpecs), + ) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer from CSV files: %w", err) + } + + cudaCompatDiscoverer := l.cudaCompatDiscoverer() + + ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(l.logger, driverDiscoverer, l.hookCreator, l.ldconfigPath) + if err != nil { + return nil, fmt.Errorf("failed to create ldcache update hook discoverer: %w", err) + } + + d := discover.Merge( + driverDiscoverer, + cudaCompatDiscoverer, + // The ldcacheUpdateHook is added last to ensure that the created symlinks are included + ldcacheUpdateHook, + ) + return d, nil +} + +// cudaCompatDiscoverer returns a discoverer for the CUDA forward compat hook +// on Tegra-based systems. +// If the system has NVML available, this is used to determine the driver +// version to be passed to the hook. +// On Orin-based systems, the compat library root in the container is also set. +func (l *csvlib) cudaCompatDiscoverer() discover.Discover { + hasNvml, _ := l.infolib.HasNvml() + if !hasNvml { + return nil + } + + ret := l.nvmllib.Init() + if ret != nvml.SUCCESS { + l.logger.Warningf("Failed to initialize NVML: %v", ret) + return nil + } + defer func() { + _ = l.nvmllib.Shutdown() + }() + + version, ret := l.nvmllib.SystemGetDriverVersion() + if ret != nvml.SUCCESS { + l.logger.Warningf("Failed to get driver version: %v", ret) + return nil + } + + var names []string + err := l.devicelib.VisitDevices(func(i int, d device.Device) error { + name, ret := d.GetName() + if ret != nvml.SUCCESS { + return fmt.Errorf("device %v: %v", i, ret) + } + names = append(names, name) + return nil + }) + if err != nil { + l.logger.Warningf("Failed to get device names: %v", err) + return nil + } + + var cudaCompatContainerRoot string + for _, name := range names { + // TODO: Should this be overridable through a feature flag / config option? + if strings.Contains(name, "Orin (nvgpu)") { + // TODO: This should probably be a constant or configurable. + cudaCompatContainerRoot = "/usr/local/cuda/compat-orin" + break + } + } + + return discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version, cudaCompatContainerRoot) +}