Skip to content

Commit c8056eb

Browse files
committed
Add enable-cuda-compat on Tegra-based systems
This change ensures that Tegra-based systems also include an enable-cuda-compat hook. On Orin devices, the compat root in the container is set to /usr/local/cuda/compat-orin so that these compat libraries can be selected (if present in the container). Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 6ea6955 commit c8056eb

File tree

2 files changed

+94
-26
lines changed

2 files changed

+94
-26
lines changed

internal/platform-support/tegra/tegra.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,6 @@ func New(opts ...Option) (discover.Discover, error) {
5959
return nil, fmt.Errorf("failed to create discoverer for mount specs: %v", err)
6060
}
6161

62-
ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(o.logger, mountSpecDiscoverer, o.hookCreator, o.ldconfigPath)
63-
if err != nil {
64-
return nil, fmt.Errorf("failed to create ldcach update hook discoverer: %v", err)
65-
}
66-
6762
tegraSystemMounts := discover.NewMounts(
6863
o.logger,
6964
lookup.NewFileLocator(lookup.WithLogger(o.logger)),
@@ -75,9 +70,6 @@ func New(opts ...Option) (discover.Discover, error) {
7570

7671
d := discover.Merge(
7772
mountSpecDiscoverer,
78-
// The ldcacheUpdateHook is added after the mount spec discoverer to
79-
// ensure that the symlinks are included.
80-
ldcacheUpdateHook,
8173
tegraSystemMounts,
8274
)
8375

pkg/nvcdi/lib-csv.go

Lines changed: 94 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"fmt"
2121
"slices"
2222
"strconv"
23+
"strings"
2324

2425
"tags.cncf.io/container-device-interface/pkg/cdi"
2526
"tags.cncf.io/container-device-interface/specs-go"
@@ -101,12 +102,12 @@ func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) {
101102
}
102103
e, err := edits.FromDiscoverer(deviceNodeDiscoverer)
103104
if err != nil {
104-
return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err)
105+
return nil, fmt.Errorf("failed to create container edits for CSV files: %w", err)
105106
}
106107

107108
names, err := l.deviceNamers.GetDeviceNames(l.index, l)
108109
if err != nil {
109-
return nil, fmt.Errorf("failed to get device name: %v", err)
110+
return nil, fmt.Errorf("failed to get device name: %w", err)
110111
}
111112
var deviceSpecs []specs.Device
112113
for _, name := range names {
@@ -157,22 +158,7 @@ func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) {
157158
// GetCommonEdits generates a CDI specification that can be used for ANY devices
158159
// These explicitly do not include any device nodes.
159160
func (l *csvlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
160-
mountSpecs := tegra.Transform(
161-
tegra.Transform(
162-
tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...),
163-
tegra.WithoutDeviceNodes(),
164-
),
165-
tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...),
166-
)
167-
driverDiscoverer, err := tegra.New(
168-
tegra.WithLogger(l.logger),
169-
tegra.WithDriverRoot(l.driverRoot),
170-
tegra.WithDevRoot(l.devRoot),
171-
tegra.WithHookCreator(l.hookCreator),
172-
tegra.WithLdconfigPath(l.ldconfigPath),
173-
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
174-
tegra.WithMountSpecs(mountSpecs),
175-
)
161+
driverDiscoverer, err := l.driverDiscoverer()
176162
if err != nil {
177163
return nil, fmt.Errorf("failed to create driver discoverer from CSV files: %w", err)
178164
}
@@ -321,3 +307,93 @@ func isIntegratedGPU(d nvml.Device) (bool, error) {
321307
}
322308
return pciInfo.Device == 0, nil
323309
}
310+
311+
func (l *csvlib) driverDiscoverer() (discover.Discover, error) {
312+
mountSpecs := tegra.Transform(
313+
tegra.Transform(
314+
tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...),
315+
tegra.WithoutDeviceNodes(),
316+
),
317+
tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...),
318+
)
319+
driverDiscoverer, err := tegra.New(
320+
tegra.WithLogger(l.logger),
321+
tegra.WithDriverRoot(l.driverRoot),
322+
tegra.WithDevRoot(l.devRoot),
323+
tegra.WithHookCreator(l.hookCreator),
324+
tegra.WithLdconfigPath(l.ldconfigPath),
325+
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
326+
tegra.WithMountSpecs(mountSpecs),
327+
)
328+
if err != nil {
329+
return nil, fmt.Errorf("failed to create discoverer from CSV files: %w", err)
330+
}
331+
332+
cudaCompatDiscoverer := l.cudaCompatDiscoverer()
333+
334+
ldcacheUpdateHook, err := discover.NewLDCacheUpdateHook(l.logger, driverDiscoverer, l.hookCreator, l.ldconfigPath)
335+
if err != nil {
336+
return nil, fmt.Errorf("failed to create ldcache update hook discoverer: %w", err)
337+
}
338+
339+
d := discover.Merge(
340+
driverDiscoverer,
341+
cudaCompatDiscoverer,
342+
// The ldcacheUpdateHook is added last to ensure that the created symlinks are included
343+
ldcacheUpdateHook,
344+
)
345+
return d, nil
346+
}
347+
348+
// cudaCompatDiscoverer returns a discoverer for the CUDA forward compat hook
349+
// on Tegra-based systems.
350+
// NVML is used to determine the driver version to be passed to the hook; if
351+
// NVML is unavailable or any NVML query fails, nil is returned instead.
352+
// On Orin-based systems, the compat library root in the container is also set.
353+
func (l *csvlib) cudaCompatDiscoverer() discover.Discover {
354+
hasNvml, _ := l.infolib.HasNvml()
355+
if !hasNvml {
356+
return nil
357+
}
358+
359+
ret := l.nvmllib.Init()
360+
if ret != nvml.SUCCESS {
361+
l.logger.Warningf("Failed to initialize NVML: %v", ret)
362+
return nil
363+
}
364+
defer func() {
365+
_ = l.nvmllib.Shutdown()
366+
}()
367+
368+
version, ret := l.nvmllib.SystemGetDriverVersion()
369+
if ret != nvml.SUCCESS {
370+
l.logger.Warningf("Failed to get driver version: %v", ret)
371+
return nil
372+
}
373+
374+
var names []string
375+
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
376+
name, ret := d.GetName()
377+
if ret != nvml.SUCCESS {
378+
return fmt.Errorf("device %v: %v", i, ret)
379+
}
380+
names = append(names, name)
381+
return nil
382+
})
383+
if err != nil {
384+
l.logger.Warningf("Failed to get device names: %v", err)
385+
return nil
386+
}
387+
388+
var cudaCompatContainerRoot string
389+
for _, name := range names {
390+
// TODO: Should this be overridable through a feature flag / config option?
391+
if strings.Contains(name, "Orin (nvgpu)") {
392+
// TODO: This should probably be a constant or configurable.
393+
cudaCompatContainerRoot = "/usr/local/cuda/compat-orin"
394+
break
395+
}
396+
}
397+
398+
return discover.NewCUDACompatHookDiscoverer(l.logger, l.hookCreator, version, cudaCompatContainerRoot)
399+
}

0 commit comments

Comments
 (0)