diff --git a/go.mod b/go.mod index 554aacfa5..9ebb1bfbe 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Masterminds/semver v1.5.0 github.com/NVIDIA/go-nvlib v0.7.1 github.com/NVIDIA/go-nvml v0.12.4-1 - github.com/NVIDIA/nvidia-container-toolkit v1.17.1-0.20250206090248-1d0777ee0165 + github.com/NVIDIA/nvidia-container-toolkit v1.17.5-0.20250306104533-1c13a9647cc1 github.com/google/uuid v1.6.0 github.com/prometheus/client_golang v1.19.1 github.com/sirupsen/logrus v1.9.3 @@ -89,3 +89,5 @@ require ( sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect sigs.k8s.io/yaml v1.4.0 // indirect ) + +replace github.com/NVIDIA/nvidia-container-toolkit => ../container-toolkit diff --git a/go.sum b/go.sum index 9fd7020be..c77f86df3 100644 --- a/go.sum +++ b/go.sum @@ -6,8 +6,6 @@ github.com/NVIDIA/go-nvlib v0.7.1 h1:7HHPZxoCjSLm1NgaRRjuhI8ffMCpc5Vgpg5yxQYUff8 github.com/NVIDIA/go-nvlib v0.7.1/go.mod h1:2Kh2kYSP5IJ8EKf0/SYDzHiQKb9EJkwOf2LQzu6pXzY= github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc= github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= -github.com/NVIDIA/nvidia-container-toolkit v1.17.1-0.20250206090248-1d0777ee0165 h1:YYs5V+gwO+qnjLK0yhHGvXGWdbqMOmst/5aJEcIkA1Q= -github.com/NVIDIA/nvidia-container-toolkit v1.17.1-0.20250206090248-1d0777ee0165/go.mod h1:HbdWdB4ukjPWvSIYhNFKthzVuQATUBEXxSskcABWvP0= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/discover/compat_libs.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/discover/compat_libs.go new file mode 100644 index 000000000..027ca2ed2 --- /dev/null +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/discover/compat_libs.go @@ -0,0 +1,24 @@ +package discover + +import ( + "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" +) + +// NewCUDACompatHookDiscoverer creates a discoverer for a enable-cuda-compat hook. +// This hook is responsible for setting up CUDA compatibility in the container and depends on the host driver version. +func NewCUDACompatHookDiscoverer(logger logger.Interface, nvidiaCDIHookPath string, driver *root.Driver) Discover { + _, cudaVersionPattern := getCUDALibRootAndVersionPattern(logger, driver) + var args []string + if !strings.Contains(cudaVersionPattern, "*") { + args = append(args, "--host-driver-version="+cudaVersionPattern) + } + + return CreateNvidiaCDIHook( + nvidiaCDIHookPath, + "enable-cuda-compat", + args..., + ) +} diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/device.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/device.go index e4fbe330f..8ec6c4c91 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/device.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/device.go @@ -28,15 +28,9 @@ const ( // NewCharDeviceLocator creates a Locator that can be used to find char devices at the specified root. A logger is // also specified. func NewCharDeviceLocator(opts ...Option) Locator { - filter := assertCharDevice - // TODO: We should have a better way to inject this logic than this envvar. - if os.Getenv("__NVCT_TESTING_DEVICES_ARE_FILES") == "true" { - filter = assertFile - } - opts = append(opts, WithSearchPaths("", devRoot), - WithFilter(filter), + WithFilter(assertCharDevice), ) return NewFileLocator( opts..., diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/api.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/api.go index f1c7b97ac..1c84fefaa 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/api.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/api.go @@ -24,6 +24,24 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" ) +const ( + // ModeAuto configures the CDI spec generator to automatically detect the system configuration + ModeAuto = "auto" + // ModeNvml configures the CDI spec generator to use the NVML library. + ModeNvml = "nvml" + // ModeWsl configures the CDI spec generator to generate a WSL spec. + ModeWsl = "wsl" + // ModeManagement configures the CDI spec generator to generate a management spec. + ModeManagement = "management" + // ModeGds configures the CDI spec generator to generate a GDS spec. + ModeGds = "gds" + // ModeMofed configures the CDI spec generator to generate a MOFED spec. + ModeMofed = "mofed" + // ModeCSV configures the CDI spec generator to generate a spec based on the contents of CSV + // mountspec files. + ModeCSV = "csv" +) + // Interface defines the API for the nvcdi package type Interface interface { GetSpec() (spec.Interface, error) diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/driver-nvml.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/driver-nvml.go index b0006aebf..782d60fcd 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/driver-nvml.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/driver-nvml.go @@ -97,6 +97,8 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv libraryPaths, ) + // TODO: The following should use the version directly. + cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, nvidiaCDIHookPath, driver) updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath) d := discover.Merge( @@ -105,6 +107,7 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv version, nvidiaCDIHookPath, ), + cudaCompatLibHookDiscoverer, updateLDCache, ) diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-imex.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-imex.go deleted file mode 100644 index 3c375d56f..000000000 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-imex.go +++ /dev/null @@ -1,118 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package nvcdi - -import ( - "fmt" - "path/filepath" - "strconv" - "strings" - - "tags.cncf.io/container-device-interface/pkg/cdi" - "tags.cncf.io/container-device-interface/specs-go" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - - "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" - "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" - "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" -) - -type imexlib nvcdilib - -var _ Interface = (*imexlib)(nil) - -const ( - classImexChannel = "imex-channel" -) - -// GetSpec should not be called for imexlib. -func (l *imexlib) GetSpec() (spec.Interface, error) { - return nil, fmt.Errorf("unexpected call to imexlib.GetSpec()") -} - -// GetAllDeviceSpecs returns the device specs for all available devices. -func (l *imexlib) GetAllDeviceSpecs() ([]specs.Device, error) { - channelsDiscoverer := discover.NewCharDeviceDiscoverer( - l.logger, - l.devRoot, - []string{"/dev/nvidia-caps-imex-channels/channel*"}, - ) - - channels, err := channelsDiscoverer.Devices() - if err != nil { - return nil, err - } - - var channelIDs []string - for _, channel := range channels { - channelIDs = append(channelIDs, filepath.Base(channel.Path)) - } - - return l.GetDeviceSpecsByID(channelIDs...) -} - -// GetCommonEdits returns an empty set of edits for IMEX devices. -func (l *imexlib) GetCommonEdits() (*cdi.ContainerEdits, error) { - return edits.FromDiscoverer(discover.None{}) -} - -// GetDeviceSpecsByID returns the CDI device specs for the IMEX channels specified. -func (l *imexlib) GetDeviceSpecsByID(ids ...string) ([]specs.Device, error) { - var deviceSpecs []specs.Device - for _, id := range ids { - trimmed := strings.TrimPrefix(id, "channel") - _, err := strconv.ParseUint(trimmed, 10, 64) - if err != nil { - return nil, fmt.Errorf("invalid channel ID %v: %w", id, err) - } - path := "/dev/nvidia-caps-imex-channels/channel" + trimmed - deviceSpec := specs.Device{ - Name: trimmed, - ContainerEdits: specs.ContainerEdits{ - DeviceNodes: []*specs.DeviceNode{ - { - Path: path, - HostPath: filepath.Join(l.devRoot, path), - }, - }, - }, - } - deviceSpecs = append(deviceSpecs, deviceSpec) - } - return deviceSpecs, nil -} - -// GetGPUDeviceEdits is unsupported for the imexlib specs -func (l *imexlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) { - return nil, fmt.Errorf("GetGPUDeviceEdits is not supported") -} - -// GetGPUDeviceSpecs is unsupported for the imexlib specs -func (l *imexlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) { - return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") -} - -// GetMIGDeviceEdits is unsupported for the imexlib specs -func (l *imexlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) { - return nil, fmt.Errorf("GetMIGDeviceEdits is not supported") -} - -// GetMIGDeviceSpecs is unsupported for the imexlib specs -func (l *imexlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) { - return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") -} diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-nvml.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-nvml.go index c940b090d..01c22ff37 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-nvml.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-nvml.go @@ -37,7 +37,7 @@ var _ Interface = (*nvmllib)(nil) // GetSpec should not be called for nvmllib func (l *nvmllib) GetSpec() (spec.Interface, error) { - return nil, fmt.Errorf("unexpected call to nvmllib.GetSpec()") + return nil, fmt.Errorf("Unexpected call to nvmllib.GetSpec()") } // GetAllDeviceSpecs returns the device specs for all available devices. diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib.go index 409cea76b..0e81247d7 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib.go @@ -46,7 +46,7 @@ type nvcdilib struct { logger logger.Interface nvmllib nvml.Interface nvsandboxutilslib nvsandboxutils.Interface - mode Mode + mode string devicelib device.Interface deviceNamers DeviceNamers driverRoot string @@ -111,19 +111,24 @@ func New(opts ...Option) (Interface, error) { } l.nvmllib = nvml.New(nvmlOpts...) } - if l.nvsandboxutilslib == nil { - var nvsandboxutilsOpts []nvsandboxutils.LibraryOption - // Set the library path for libnvidia-sandboxutils - candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1") - if err != nil { - l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err) - } else { - libNvidiaSandboxutilsPath := candidates[0] - l.logger.Infof("Using %v", libNvidiaSandboxutilsPath) - nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath)) - } - l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...) - } + // TODO: Repeated calls to nvsandboxutils.Init and Shutdown are causing + // segmentation violations. Here we disabled nvsandbox utils unless explicitly + // specified. + // This will be reenabled as soon as we have more visibility into why this is + // happening and a mechanism to detect and disable this if required. + // if l.nvsandboxutilslib == nil { + // var nvsandboxutilsOpts []nvsandboxutils.LibraryOption + // // Set the library path for libnvidia-sandboxutils + // candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1") + // if err != nil { + // l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err) + // } else { + // libNvidiaSandboxutilsPath := candidates[0] + // l.logger.Infof("Using %v", libNvidiaSandboxutilsPath) + // nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath)) + // } + // l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...) + // } if l.devicelib == nil { l.devicelib = device.New(l.nvmllib) } @@ -162,11 +167,6 @@ func New(opts ...Option) (Interface, error) { l.class = "mofed" } lib = (*mofedlib)(l) - case ModeImex: - if l.class == "" { - l.class = classImexChannel - } - lib = (*imexlib)(l) default: return nil, fmt.Errorf("unknown mode %q", l.mode) } @@ -212,6 +212,28 @@ func (m *wrapper) GetCommonEdits() (*cdi.ContainerEdits, error) { return edits, nil } +// resolveMode resolves the mode for CDI spec generation based on the current system. +func (l *nvcdilib) resolveMode() (rmode string) { + if l.mode != ModeAuto { + return l.mode + } + defer func() { + l.logger.Infof("Auto-detected mode as '%v'", rmode) + }() + + platform := l.infolib.ResolvePlatform() + switch platform { + case info.PlatformNVML: + return ModeNvml + case info.PlatformTegra: + return ModeCSV + case info.PlatformWSL: + return ModeWsl + } + l.logger.Warningf("Unsupported platform detected: %v; assuming %v", platform, ModeNvml) + return ModeNvml +} + // getCudaVersion returns the CUDA version of the current system. func (l *nvcdilib) getCudaVersion() (string, error) { version, err := l.getCudaVersionNvsandboxutils() diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/mode.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/mode.go deleted file mode 100644 index 5b8f0369e..000000000 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/mode.go +++ /dev/null @@ -1,119 +0,0 @@ -/** -# Copyright 2024 NVIDIA CORPORATION -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package nvcdi - -import ( - "sync" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" -) - -type Mode string - -const ( - // ModeAuto configures the CDI spec generator to automatically detect the system configuration - ModeAuto = Mode("auto") - // ModeNvml configures the CDI spec generator to use the NVML library. - ModeNvml = Mode("nvml") - // ModeWsl configures the CDI spec generator to generate a WSL spec. - ModeWsl = Mode("wsl") - // ModeManagement configures the CDI spec generator to generate a management spec. - ModeManagement = Mode("management") - // ModeGds configures the CDI spec generator to generate a GDS spec. - ModeGds = Mode("gds") - // ModeMofed configures the CDI spec generator to generate a MOFED spec. - ModeMofed = Mode("mofed") - // ModeCSV configures the CDI spec generator to generate a spec based on the contents of CSV - // mountspec files. - ModeCSV = Mode("csv") - // ModeImex configures the CDI spec generated to generate a spec for the available IMEX channels. - ModeImex = Mode("imex") -) - -type modeConstraint interface { - string | Mode -} - -type modes struct { - lookup map[Mode]bool - all []Mode -} - -var validModes modes -var validModesOnce sync.Once - -func getModes() modes { - validModesOnce.Do(func() { - all := []Mode{ - ModeAuto, - ModeNvml, - ModeWsl, - ModeManagement, - ModeGds, - ModeMofed, - ModeCSV, - } - lookup := make(map[Mode]bool) - - for _, m := range all { - lookup[m] = true - } - - validModes = modes{ - lookup: lookup, - all: all, - } - }, - ) - return validModes -} - -// AllModes returns the set of valid modes. -func AllModes[T modeConstraint]() []T { - var output []T - for _, m := range getModes().all { - output = append(output, T(m)) - } - return output -} - -// IsValidMode checks whether a specified mode is valid. -func IsValidMode[T modeConstraint](mode T) bool { - return getModes().lookup[Mode(mode)] -} - -// resolveMode resolves the mode for CDI spec generation based on the current system. -func (l *nvcdilib) resolveMode() (rmode Mode) { - if l.mode != ModeAuto { - return l.mode - } - defer func() { - l.logger.Infof("Auto-detected mode as '%v'", rmode) - }() - - platform := l.infolib.ResolvePlatform() - switch platform { - case info.PlatformNVML: - return ModeNvml - case info.PlatformTegra: - return ModeCSV - case info.PlatformWSL: - return ModeWsl - } - l.logger.Warningf("Unsupported platform detected: %v; assuming %v", platform, ModeNvml) - return ModeNvml -} diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/options.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/options.go index 362545d25..417687b96 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/options.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/options.go @@ -99,9 +99,9 @@ func WithNvmlLib(nvmllib nvml.Interface) Option { } // WithMode sets the discovery mode for the library -func WithMode[m modeConstraint](mode m) Option { +func WithMode(mode string) Option { return func(l *nvcdilib) { - l.mode = Mode(mode) + l.mode = mode } } diff --git a/vendor/modules.txt b/vendor/modules.txt index 206cdf691..5ebb86127 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -9,8 +9,8 @@ github.com/NVIDIA/go-nvlib/pkg/nvlib/info ## explicit; go 1.20 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml -# github.com/NVIDIA/nvidia-container-toolkit v1.17.1-0.20250206090248-1d0777ee0165 -## explicit; go 1.22.0 +# github.com/NVIDIA/nvidia-container-toolkit v1.17.5-0.20250306104533-1c13a9647cc1 => ../container-toolkit +## explicit; go 1.22 github.com/NVIDIA/nvidia-container-toolkit/internal/config/image github.com/NVIDIA/nvidia-container-toolkit/internal/discover github.com/NVIDIA/nvidia-container-toolkit/internal/dxcore @@ -834,3 +834,4 @@ tags.cncf.io/container-device-interface/pkg/parser # tags.cncf.io/container-device-interface/specs-go v0.8.0 ## explicit; go 1.19 tags.cncf.io/container-device-interface/specs-go +# github.com/NVIDIA/nvidia-container-toolkit => ../container-toolkit