Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Default to cdi #910

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/nvidia-container-runtime-hook/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func doPrestart() {
return
}

if !hook.NVIDIAContainerRuntimeHookConfig.SkipModeDetection && info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntimeConfig.Mode, container.Image) != "legacy" {
if !hook.NVIDIAContainerRuntimeHookConfig.SkipModeDetection && info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntimeConfig.Mode, container.Image) != info.RuntimeModeLegacy {
log.Panicln("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead.")
}

Expand Down
8 changes: 4 additions & 4 deletions cmd/nvidia-container-runtime/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ func TestGoodInput(t *testing.T) {
// Check config.json for NVIDIA prestart hook
spec, err = cfg.getRuntimeSpec()
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
require.NotEmpty(t, spec.Hooks, "there should be hooks in config.json")
require.Equal(t, 1, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
require.Empty(t, spec.Hooks, "there should be hooks in config.json")
require.Equal(t, 0, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
}

// NVIDIA prestart hook already present in config file
Expand Down Expand Up @@ -171,8 +171,8 @@ func TestDuplicateHook(t *testing.T) {
// Check config.json for NVIDIA prestart hook
spec, err = cfg.getRuntimeSpec()
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
require.NotEmpty(t, spec.Hooks, "there should be hooks in config.json")
require.Equal(t, 1, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
require.Empty(t, spec.Hooks, "there should be no hooks in config.json")
require.Equal(t, 0, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
}

// addNVIDIAHook is a basic wrapper for an addHookModifier that is used for
Expand Down
24 changes: 17 additions & 7 deletions internal/info/auto.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,32 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)

// A RuntimeMode is used to select a specific mode of operation for the NVIDIA Container Runtime.
type RuntimeMode string

const (
RuntimeModeLegacy = RuntimeMode("legacy")
RuntimeModeCSV = RuntimeMode("csv")
RuntimeModeCDI = RuntimeMode("cdi")
RuntimeModeJitCDI = RuntimeMode("jit-cdi")
)

// ResolveAutoMode determines the correct mode for the platform if set to "auto"
func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) {
func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode RuntimeMode) {
return resolveMode(logger, mode, image, nil)
}

func resolveMode(logger logger.Interface, mode string, image image.CUDA, propertyExtractor info.PropertyExtractor) (rmode string) {
func resolveMode(logger logger.Interface, mode string, image image.CUDA, propertyExtractor info.PropertyExtractor) (rmode RuntimeMode) {
if mode != "auto" {
logger.Infof("Using requested mode '%s'", mode)
return mode
return RuntimeMode(mode)
}
defer func() {
logger.Infof("Auto-detected mode as '%v'", rmode)
}()

if image.OnlyFullyQualifiedCDIDevices() {
return "cdi"
return RuntimeModeCDI
}

nvinfo := info.New(
Expand All @@ -48,9 +58,9 @@ func resolveMode(logger logger.Interface, mode string, image image.CUDA, propert

switch nvinfo.ResolvePlatform() {
case info.PlatformNVML, info.PlatformWSL:
return "legacy"
return RuntimeModeJitCDI
case info.PlatformTegra:
return "csv"
return RuntimeModeCSV
}
return "legacy"
return RuntimeModeJitCDI
}
29 changes: 17 additions & 12 deletions internal/info/auto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,15 @@ func TestResolveAutoMode(t *testing.T) {
expectedMode: "not-auto",
},
{
description: "no info defaults to legacy",
description: "legacy resolves to legacy",
mode: "legacy",
expectedMode: "legacy",
},
{
description: "no info defaults to jit-cdi",
mode: "auto",
info: map[string]bool{},
expectedMode: "legacy",
expectedMode: "jit-cdi",
},
{
description: "non-nvml, non-tegra, nvgpu resolves to csv",
Expand Down Expand Up @@ -80,14 +85,14 @@ func TestResolveAutoMode(t *testing.T) {
expectedMode: "csv",
},
{
description: "nvml, non-tegra, non-nvgpu resolves to legacy",
description: "nvml, non-tegra, non-nvgpu resolves to jit-cdi",
mode: "auto",
info: map[string]bool{
"nvml": true,
"tegra": false,
"nvgpu": false,
},
expectedMode: "legacy",
expectedMode: "jit-cdi",
},
{
description: "nvml, non-tegra, nvgpu resolves to csv",
Expand All @@ -100,14 +105,14 @@ func TestResolveAutoMode(t *testing.T) {
expectedMode: "csv",
},
{
description: "nvml, tegra, non-nvgpu resolves to legacy",
description: "nvml, tegra, non-nvgpu resolves to jit-cdi",
mode: "auto",
info: map[string]bool{
"nvml": true,
"tegra": true,
"nvgpu": false,
},
expectedMode: "legacy",
expectedMode: "jit-cdi",
},
{
description: "nvml, tegra, nvgpu resolves to csv",
Expand Down Expand Up @@ -136,7 +141,7 @@ func TestResolveAutoMode(t *testing.T) {
},
},
{
description: "at least one non-cdi device resolves to legacy",
description: "at least one non-cdi device resolves to jit-cdi",
mode: "auto",
envmap: map[string]string{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
Expand All @@ -146,7 +151,7 @@ func TestResolveAutoMode(t *testing.T) {
"tegra": false,
"nvgpu": false,
},
expectedMode: "legacy",
expectedMode: "jit-cdi",
},
{
description: "at least one non-cdi device resolves to csv",
Expand All @@ -170,7 +175,7 @@ func TestResolveAutoMode(t *testing.T) {
expectedMode: "cdi",
},
{
description: "cdi mount and non-CDI devices resolves to legacy",
description: "cdi mount and non-CDI devices resolves to jit-cdi",
mode: "auto",
mounts: []string{
"/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/0",
Expand All @@ -181,10 +186,10 @@ func TestResolveAutoMode(t *testing.T) {
"tegra": false,
"nvgpu": false,
},
expectedMode: "legacy",
expectedMode: "jit-cdi",
},
{
description: "cdi mount and non-CDI envvar resolves to legacy",
description: "cdi mount and non-CDI envvar resolves to jit-cdi",
mode: "auto",
envmap: map[string]string{
"NVIDIA_VISIBLE_DEVICES": "0",
Expand All @@ -197,7 +202,7 @@ func TestResolveAutoMode(t *testing.T) {
"tegra": false,
"nvgpu": false,
},
expectedMode: "legacy",
expectedMode: "jit-cdi",
},
}

Expand Down
59 changes: 40 additions & 19 deletions internal/modifier/cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,22 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
)

const (
automaticDeviceVendor = "runtime.nvidia.com"
automaticDeviceClass = "gpu"
automaticDeviceKind = automaticDeviceVendor + "/" + automaticDeviceClass
)

// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
// used to select the devices to include.
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, isJitCDI bool) (oci.SpecModifier, error) {
defaultKind := cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind
if isJitCDI {
defaultKind = automaticDeviceKind
}

devices, err := getDevicesFromSpec(logger, ociSpec, cfg, defaultKind)
if err != nil {
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
}
Expand Down Expand Up @@ -65,7 +76,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
)
}

func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config) ([]string, error) {
func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config, defaultKind string) ([]string, error) {
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
Expand All @@ -83,26 +94,16 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
if err != nil {
return nil, err
}
if cfg.AcceptDeviceListAsVolumeMounts {
mountDevices := container.CDIDevicesFromMounts()
if len(mountDevices) > 0 {
return mountDevices, nil
}
}

var devices []string
seen := make(map[string]bool)
for _, name := range container.VisibleDevicesFromEnvVar() {
if !parser.IsQualifiedName(name) {
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
}
if seen[name] {
logger.Debugf("Ignoring duplicate device %q", name)
continue
if cfg.AcceptDeviceListAsVolumeMounts {
devices = normalizeDeviceList(logger, defaultKind, append(container.DevicesFromMounts(), container.CDIDevicesFromMounts()...)...)
if len(devices) > 0 {
return devices, nil
}
devices = append(devices, name)
}

devices = normalizeDeviceList(logger, defaultKind, container.VisibleDevicesFromEnvVar()...)
if len(devices) == 0 {
return nil, nil
}
Expand All @@ -116,6 +117,24 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
return nil, nil
}

func normalizeDeviceList(logger logger.Interface, defaultKind string, devices ...string) []string {
seen := make(map[string]bool)
var normalized []string
for _, name := range devices {
if !parser.IsQualifiedName(name) {
name = fmt.Sprintf("%s=%s", defaultKind, name)
}
if seen[name] {
logger.Debugf("Ignoring duplicate device %q", name)
continue
}
normalized = append(normalized, name)
seen[name] = true
}

return normalized
}

// getAnnotationDevices returns a list of devices specified in the annotations.
// Keys starting with the specified prefixes are considered and expected to contain a comma-separated list of
// fully-qualified CDI devices names. If any device name is not fully-quality an error is returned.
Expand Down Expand Up @@ -156,7 +175,7 @@ func filterAutomaticDevices(devices []string) []string {
var automatic []string
for _, device := range devices {
vendor, class, _ := parser.ParseDevice(device)
if vendor == "runtime.nvidia.com" && class == "gpu" {
if vendor == automaticDeviceVendor && class == automaticDeviceClass {
automatic = append(automatic, device)
}
}
Expand All @@ -165,6 +184,8 @@ func filterAutomaticDevices(devices []string) []string {

func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
// TODO: We should try to load the kernel modules and create the device nodes here.
// Failures should raise a warning and not error out.
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
if err != nil {
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
Expand Down
40 changes: 40 additions & 0 deletions internal/modifier/cdi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,49 @@ import (
"fmt"
"testing"

"github.com/opencontainers/runtime-spec/specs-go"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"

"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
)

func TestGetDevicesFromSpec(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
description string
spec *specs.Spec
config *config.Config
defaultKind string
expectedDevices []string
}{
{
description: "NVIDIA_VISIBLE_DEVICES=all",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=all"},
},
},
config: func() *config.Config {
c, _ := config.GetDefault()
return c
}(),
defaultKind: "runtime.nvidia.com/gpu",
expectedDevices: []string{"runtime.nvidia.com/gpu=all"},
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
devices, err := getDevicesFromSpec(logger, oci.NewMemorySpec(tc.spec), tc.config, tc.defaultKind)
require.NoError(t, err)

require.EqualValues(t, tc.expectedDevices, devices)
})
}
}

func TestGetAnnotationDevices(t *testing.T) {
testCases := []struct {
description string
Expand Down
16 changes: 8 additions & 8 deletions internal/runtime/runtime_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,26 +105,26 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
return modifiers, nil
}

func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
func newModeModifier(logger logger.Interface, mode info.RuntimeMode, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
switch mode {
case "legacy":
case info.RuntimeModeLegacy:
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
case "csv":
case info.RuntimeModeCSV:
return modifier.NewCSVModifier(logger, cfg, image)
case "cdi":
return modifier.NewCDIModifier(logger, cfg, ociSpec)
case info.RuntimeModeCDI, info.RuntimeModeJitCDI:
return modifier.NewCDIModifier(logger, cfg, ociSpec, mode == info.RuntimeModeJitCDI)
}

return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
}

// supportedModifierTypes returns the modifiers supported for a specific runtime mode.
func supportedModifierTypes(mode string) []string {
func supportedModifierTypes(mode info.RuntimeMode) []string {
switch mode {
case "cdi":
case info.RuntimeModeCDI, info.RuntimeModeJitCDI:
// For CDI mode we make no additional modifications.
return []string{"nvidia-hook-remover", "mode"}
case "csv":
case info.RuntimeModeCSV:
// For CSV mode we support mode and feature-gated modification.
return []string{"nvidia-hook-remover", "mode", "feature-gated"}
default:
Expand Down
1 change: 0 additions & 1 deletion tests/output/bundle/config.json

This file was deleted.