Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import (
)

const (
cudaCompatPath = "/usr/local/cuda/compat"
defaultCudaCompatPath = "/usr/local/cuda/compat"
// cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename
// in ld.so.conf.d that includes a reference to the CUDA compat path.
// The 00-compat prefix is chosen to ensure that these libraries have a
Expand All @@ -44,8 +44,11 @@ type command struct {
}

type options struct {
hostDriverVersion string
containerSpec string
cudaCompatContainerRoot string
hostDriverVersion string
// containerSpec allows the path to the container spec to be specified for
// testing.
containerSpec string
}

// NewCommand constructs a cuda-compat command with the specified logger
Expand Down Expand Up @@ -76,6 +79,12 @@ func (m command) build() *cli.Command {
Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.",
Destination: &cfg.hostDriverVersion,
},
&cli.StringFlag{
Name: "cuda-compat-container-root",
Usage: "Specify the folder in which CUDA compat libraries are located in the container",
Value: defaultCudaCompatPath,
Destination: &cfg.cudaCompatContainerRoot,
},
&cli.StringFlag{
Name: "container-spec",
Hidden: true,
Expand Down Expand Up @@ -108,7 +117,7 @@ func (m command) run(_ *cli.Command, cfg *options) error {
return fmt.Errorf("failed to determined container root: %w", err)
}

containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion)
containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.cudaCompatContainerRoot, cfg.hostDriverVersion)
if err != nil {
return fmt.Errorf("failed to get container forward compat directory: %w", err)
}
Expand All @@ -119,17 +128,17 @@ func (m command) run(_ *cli.Command, cfg *options) error {
return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
}

func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) {
func (m command) getContainerForwardCompatDir(containerRoot containerRoot, cudaCompatRoot string, hostDriverVersion string) (string, error) {
if hostDriverVersion == "" {
m.logger.Debugf("Host driver version not specified")
return "", nil
}
if !containerRoot.hasPath(cudaCompatPath) {
if !containerRoot.hasPath(cudaCompatRoot) {
m.logger.Debugf("No CUDA forward compatibility libraries directory in container")
return "", nil
}

libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
libs, err := containerRoot.globFiles(filepath.Join(cudaCompatRoot, "libcuda.so.*.*"))
if err != nil {
m.logger.Warningf("Failed to find CUDA compat library: %w", err)
return "", nil
Expand Down
2 changes: 1 addition & 1 deletion cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ func TestCompatLibs(t *testing.T) {
c := command{
logger: logger,
}
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion)
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), defaultCudaCompatPath, tc.hostDriverVersion)
require.NoError(t, err)
require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir)
})
Expand Down
5 changes: 4 additions & 1 deletion internal/discover/compat_libs.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ import (

// NewCUDACompatHookDiscoverer creates a discoverer for an enable-cuda-compat hook.
// This hook is responsible for setting up CUDA compatibility in the container and depends on the host driver version.
func NewCUDACompatHookDiscoverer(logger logger.Interface, hookCreator HookCreator, version string) Discover {
func NewCUDACompatHookDiscoverer(logger logger.Interface, hookCreator HookCreator, version string, cudaCompatContainerRoot string) Discover {
var args []string
if version != "" && !strings.Contains(version, "*") {
args = append(args, "--host-driver-version="+version)
}
if cudaCompatContainerRoot != "" {
args = append(args, "--cuda-compat-container-root="+cudaCompatContainerRoot)
}

return hookCreator.Create("enable-cuda-compat", args...)
}
2 changes: 1 addition & 1 deletion internal/modifier/gated.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, dr
return nil, fmt.Errorf("failed to get driver version: %w", err)
}

compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, hookCreator, version)
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, hookCreator, version, "")
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
return compatLibHookDiscoverer, nil
Expand Down
8 changes: 0 additions & 8 deletions internal/platform-support/tegra/tegra.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,6 @@ func New(opts ...Option) (discover.Discover, error) {
return nil, fmt.Errorf("failed to create discoverer for mount specs: %v", err)
}

ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(o.logger, mountSpecDiscoverer, o.hookCreator, o.ldconfigPath)
if err != nil {
return nil, fmt.Errorf("failed to create ldcach update hook discoverer: %v", err)
}

tegraSystemMounts := discover.NewMounts(
o.logger,
lookup.NewFileLocator(lookup.WithLogger(o.logger)),
Expand All @@ -75,9 +70,6 @@ func New(opts ...Option) (discover.Discover, error) {

d := discover.Merge(
mountSpecDiscoverer,
// The ldcacheUpdateHook is added after the mount spec discoverer to
// ensure that the symlinks are included.
ldcacheUpdateHook,
tegraSystemMounts,
)

Expand Down
2 changes: 1 addition & 1 deletion pkg/nvcdi/driver-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func (l *nvcdilib) NewDriverLibraryDiscoverer(version string, libcudaSoParentDir
)
discoverers = append(discoverers, driverDotSoSymlinksDiscoverer)

cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version)
cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version, "")
discoverers = append(discoverers, cudaCompatLibHookDiscoverer)

updateLDCache, _ := discover.NewLDCacheUpdateHook(l.logger, libraries, l.hookCreator, l.ldconfigPath)
Expand Down
112 changes: 94 additions & 18 deletions pkg/nvcdi/lib-csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"slices"
"strconv"
"strings"

"tags.cncf.io/container-device-interface/pkg/cdi"
"tags.cncf.io/container-device-interface/specs-go"
Expand Down Expand Up @@ -101,12 +102,12 @@ func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) {
}
e, err := edits.FromDiscoverer(deviceNodeDiscoverer)
if err != nil {
return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err)
return nil, fmt.Errorf("failed to create container edits for CSV files: %w", err)
}

names, err := l.deviceNamers.GetDeviceNames(l.index, l)
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
return nil, fmt.Errorf("failed to get device name: %w", err)
}
var deviceSpecs []specs.Device
for _, name := range names {
Expand Down Expand Up @@ -157,22 +158,7 @@ func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) {
// GetCommonEdits generates a CDI specification that can be used for ANY devices
// These explicitly do not include any device nodes.
func (l *csvlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
mountSpecs := tegra.Transform(
tegra.Transform(
tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...),
tegra.WithoutDeviceNodes(),
),
tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...),
)
driverDiscoverer, err := tegra.New(
tegra.WithLogger(l.logger),
tegra.WithDriverRoot(l.driverRoot),
tegra.WithDevRoot(l.devRoot),
tegra.WithHookCreator(l.hookCreator),
tegra.WithLdconfigPath(l.ldconfigPath),
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
tegra.WithMountSpecs(mountSpecs),
)
driverDiscoverer, err := l.driverDiscoverer()
if err != nil {
return nil, fmt.Errorf("failed to create driver discoverer from CSV files: %w", err)
}
Expand Down Expand Up @@ -321,3 +307,93 @@ func isIntegratedGPU(d nvml.Device) (bool, error) {
}
return pciInfo.Device == 0, nil
}

// driverDiscoverer assembles the discoverer used for the driver files that are
// described by the CSV files configured on l.
//
// The mount specs loaded from l.csvFiles are passed through the
// tegra.WithoutDeviceNodes and tegra.IgnoreSymlinkMountSpecsByPattern
// transformers before being handed to tegra.New. The returned discoverer is
// the merge of the tegra discoverer, the CUDA compat hook discoverer, and an
// ldcache update hook, in that order. An error is returned if either the tegra
// discoverer or the ldcache update hook cannot be created.
func (l *csvlib) driverDiscoverer() (discover.Discover, error) {
	csvMountSpecs := tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...)
	withoutDeviceNodes := tegra.Transform(csvMountSpecs, tegra.WithoutDeviceNodes())
	filteredMountSpecs := tegra.Transform(
		withoutDeviceNodes,
		tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...),
	)

	tegraOpts := []tegra.Option{
		tegra.WithLogger(l.logger),
		tegra.WithDriverRoot(l.driverRoot),
		tegra.WithDevRoot(l.devRoot),
		tegra.WithHookCreator(l.hookCreator),
		tegra.WithLdconfigPath(l.ldconfigPath),
		tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
		tegra.WithMountSpecs(filteredMountSpecs),
	}
	tegraDiscoverer, err := tegra.New(tegraOpts...)
	if err != nil {
		return nil, fmt.Errorf("failed to create discoverer from CSV files: %w", err)
	}

	// The compat discoverer is constructed before the ldcache hook so that
	// the order of side effects (NVML queries, log output) is fixed.
	compatHook := l.cudaCompatDiscoverer()

	ldcacheHook, err := discover.NewLDCacheUpdateHook(l.logger, tegraDiscoverer, l.hookCreator, l.ldconfigPath)
	if err != nil {
		return nil, fmt.Errorf("failed to create ldcache update hook discoverer: %w", err)
	}

	// The ldcache update hook goes last so that the created symlinks are
	// included when the cache is refreshed.
	return discover.Merge(tegraDiscoverer, compatHook, ldcacheHook), nil
}

// cudaCompatDiscoverer builds the discoverer for the CUDA forward compat hook
// on Tegra-based systems.
//
// NVML is used to look up the driver version that is passed to the hook. A nil
// discoverer is returned when NVML is not available, cannot be initialized, or
// when the driver version or any device name cannot be queried; the failures
// are logged as warnings.
//
// If any device name contains "Orin (nvgpu)", the compat library root in the
// container is set to the Orin-specific folder; otherwise it is left empty.
func (l *csvlib) cudaCompatDiscoverer() discover.Discover {
	if hasNvml, _ := l.infolib.HasNvml(); !hasNvml {
		return nil
	}

	if ret := l.nvmllib.Init(); ret != nvml.SUCCESS {
		l.logger.Warningf("Failed to initialize NVML: %v", ret)
		return nil
	}
	defer func() {
		_ = l.nvmllib.Shutdown()
	}()

	driverVersion, ret := l.nvmllib.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		l.logger.Warningf("Failed to get driver version: %v", ret)
		return nil
	}

	// Every device is visited (rather than stopping at the first match) so
	// that a failure to query any device name aborts the discovery.
	var deviceNames []string
	collectName := func(index int, dev device.Device) error {
		deviceName, ret := dev.GetName()
		if ret != nvml.SUCCESS {
			return fmt.Errorf("device %v: %v", index, ret)
		}
		deviceNames = append(deviceNames, deviceName)
		return nil
	}
	if err := l.devicelib.VisitDevices(collectName); err != nil {
		l.logger.Warningf("Failed to get device names: %v", err)
		return nil
	}

	// TODO: Should this be overridable through a feature flag / config option?
	isOrin := func(deviceName string) bool {
		return strings.Contains(deviceName, "Orin (nvgpu)")
	}

	var compatRoot string
	if slices.ContainsFunc(deviceNames, isOrin) {
		// TODO: This should probably be a constant or configurable.
		compatRoot = "/usr/local/cuda/compat-orin"
	}

	return discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, driverVersion, compatRoot)
}