From 74159cf28d1de69947b7bcb4b083db4c2e6c9ff2 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 11 Sep 2024 11:40:34 +0200 Subject: [PATCH 1/8] [no-relnote] Remove unused code Signed-off-by: Evan Lezar --- cmd/nvidia-container-runtime-hook/hook_config.go | 9 --------- internal/config/hook.go | 10 ---------- internal/config/runtime.go | 10 ---------- 3 files changed, 29 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/hook_config.go b/cmd/nvidia-container-runtime-hook/hook_config.go index afdb8bbc..4936af67 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config.go +++ b/cmd/nvidia-container-runtime-hook/hook_config.go @@ -20,15 +20,6 @@ const ( // HookConfig : options for the nvidia-container-runtime-hook. type HookConfig config.Config -func getDefaultHookConfig() (HookConfig, error) { - defaultCfg, err := config.GetDefault() - if err != nil { - return HookConfig{}, err - } - - return *(*HookConfig)(defaultCfg), nil -} - // loadConfig loads the required paths for the hook config. func loadConfig() (*config.Config, error) { var configPaths []string diff --git a/internal/config/hook.go b/internal/config/hook.go index 3292f138..4d8b448f 100644 --- a/internal/config/hook.go +++ b/internal/config/hook.go @@ -24,13 +24,3 @@ type RuntimeHookConfig struct { // SkipModeDetection disables the mode check for the runtime hook. SkipModeDetection bool `toml:"skip-mode-detection"` } - -// GetDefaultRuntimeHookConfig defines the default values for the config -func GetDefaultRuntimeHookConfig() (*RuntimeHookConfig, error) { - cfg, err := GetDefault() - if err != nil { - return nil, err - } - - return &cfg.NVIDIAContainerRuntimeHookConfig, nil -} diff --git a/internal/config/runtime.go b/internal/config/runtime.go index ed9ea646..2ba1b7a8 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -45,13 +45,3 @@ type cdiModeConfig struct { type csvModeConfig struct { MountSpecPath string `toml:"mount-spec-path"` } - -// GetDefaultRuntimeConfig defines the default values for the config -func GetDefaultRuntimeConfig() (*RuntimeConfig, error) { - cfg, err := GetDefault() - if err != nil { - return nil, err - } - - return &cfg.NVIDIAContainerRuntimeConfig, nil -} From 6385bd5259d50da88ebd6216f5696c72c3a05f51 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 11 Sep 2024 11:56:55 +0200 Subject: [PATCH 2/8] [no-relnote] Refactor config handling for hook This change removes indirect calls to get the default config from the nvidia-container-runtime-hook. Signed-off-by: Evan Lezar --- .../container_config.go | 16 ++-- .../container_config_test.go | 78 +++++++++++-------- .../hook_config.go | 15 ++-- .../hook_config_test.go | 13 ++-- cmd/nvidia-container-runtime-hook/main.go | 2 +- 5 files changed, 72 insertions(+), 52 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index fa39bf2f..8d382af5 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -236,7 +236,7 @@ func getDevicesFromMounts(mounts []Mount) *string { return &ret } -func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *string { +func (hookConfig *hookConfig) getDevices(image image.CUDA, mounts []Mount, privileged bool) *string { // If enabled, try and get the device list from volume mounts first if hookConfig.AcceptDeviceListAsVolumeMounts { devices := getDevicesFromMounts(mounts) @@ -284,10 +284,10 @@ func getImexChannels(image image.CUDA) *string { return &chans } -func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities { +func (hookConfig *hookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities { // We use the default driver capabilities by default. This is filtered to only include the // supported capabilities - supportedDriverCapabilities := image.NewDriverCapabilities(c.SupportedDriverCapabilities) + supportedDriverCapabilities := image.NewDriverCapabilities(hookConfig.SupportedDriverCapabilities) capabilities := supportedDriverCapabilities.Intersection(image.DefaultDriverCapabilities) @@ -311,11 +311,11 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo return capabilities } -func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig { +func (hookConfig *hookConfig) getNvidiaConfig(image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig { legacyImage := image.IsLegacy() var devices string - if d := getDevices(hookConfig, image, mounts, privileged); d != nil { + if d := hookConfig.getDevices(image, mounts, privileged); d != nil { devices = *d } else { // 'nil' devices means this is not a GPU container. @@ -360,7 +360,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, p } } -func getContainerConfig(hook HookConfig) (config containerConfig) { +func (hookConfig *hookConfig) getContainerConfig() (config containerConfig) { var h HookState d := json.NewDecoder(os.Stdin) if err := d.Decode(&h); err != nil { @@ -376,7 +376,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { image, err := image.New( image.WithEnv(s.Process.Env), - image.WithDisableRequire(hook.DisableRequire), + image.WithDisableRequire(hookConfig.DisableRequire), ) if err != nil { log.Panicln(err) @@ -387,6 +387,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { Pid: h.Pid, Rootfs: s.Root.Path, Image: image, - Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged), + Nvidia: hookConfig.getNvidiaConfig(image, s.Mounts, privileged), } } diff --git a/cmd/nvidia-container-runtime-hook/container_config_test.go b/cmd/nvidia-container-runtime-hook/container_config_test.go index 43fac8aa..270acbaa 100644 --- a/cmd/nvidia-container-runtime-hook/container_config_test.go +++ b/cmd/nvidia-container-runtime-hook/container_config_test.go @@ -7,6 +7,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" ) @@ -15,7 +16,7 @@ func TestGetNvidiaConfig(t *testing.T) { description string env map[string]string privileged bool - hookConfig *HookConfig + hookConfig *hookConfig expectedConfig *nvidiaConfig expectedPanic bool }{ @@ -394,8 +395,10 @@ func TestGetNvidiaConfig(t *testing.T) { envNVDriverCapabilities: "all", }, privileged: true, - hookConfig: &HookConfig{ - SupportedDriverCapabilities: "video,display", + hookConfig: &hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: "video,display", + }, }, expectedConfig: &nvidiaConfig{ Devices: "all", @@ -409,8 +412,10 @@ func TestGetNvidiaConfig(t *testing.T) { envNVDriverCapabilities: "video,display", }, privileged: true, - hookConfig: &HookConfig{ - SupportedDriverCapabilities: "video,display,compute,utility", + hookConfig: &hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: "video,display,compute,utility", + }, }, expectedConfig: &nvidiaConfig{ Devices: "all", @@ -423,8 +428,10 @@ func TestGetNvidiaConfig(t *testing.T) { envNVVisibleDevices: "all", }, privileged: true, - hookConfig: &HookConfig{ - SupportedDriverCapabilities: "video,display,utility,compute", + hookConfig: &hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: "video,display,utility,compute", + }, }, expectedConfig: &nvidiaConfig{ Devices: "all", @@ -438,9 +445,11 @@ func TestGetNvidiaConfig(t *testing.T) { "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, - hookConfig: &HookConfig{ - SwarmResource: "DOCKER_SWARM_RESOURCE", - SupportedDriverCapabilities: "video,display,utility,compute", + hookConfig: &hookConfig{ + Config: &config.Config{ + SwarmResource: "DOCKER_SWARM_RESOURCE", + SupportedDriverCapabilities: "video,display,utility,compute", + }, }, expectedConfig: &nvidiaConfig{ Devices: "GPU1,GPU2", @@ -454,9 +463,11 @@ func TestGetNvidiaConfig(t *testing.T) { "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, - hookConfig: &HookConfig{ - SwarmResource: "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE", - SupportedDriverCapabilities: "video,display,utility,compute", + hookConfig: &hookConfig{ + Config: &config.Config{ + SwarmResource: "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE", + SupportedDriverCapabilities: "video,display,utility,compute", + }, }, expectedConfig: &nvidiaConfig{ Devices: "GPU1,GPU2", @@ -470,14 +481,14 @@ func TestGetNvidiaConfig(t *testing.T) { image.WithEnvMap(tc.env), ) // Wrap the call to getNvidiaConfig() in a closure. - var config *nvidiaConfig + var cfg *nvidiaConfig getConfig := func() { - hookConfig := tc.hookConfig - if hookConfig == nil { - defaultConfig, _ := getDefaultHookConfig() - hookConfig = &defaultConfig + hookCfg := tc.hookConfig + if hookCfg == nil { + defaultConfig, _ := config.GetDefault() + hookCfg = &hookConfig{defaultConfig} } - config = getNvidiaConfig(hookConfig, image, nil, tc.privileged) + cfg = hookCfg.getNvidiaConfig(image, nil, tc.privileged) } // For any tests that are expected to panic, make sure they do. @@ -491,18 +502,18 @@ func TestGetNvidiaConfig(t *testing.T) { // And start comparing the test results to the expected results. if tc.expectedConfig == nil { - require.Nil(t, config, tc.description) + require.Nil(t, cfg, tc.description) return } - require.NotNil(t, config, tc.description) + require.NotNil(t, cfg, tc.description) - require.Equal(t, tc.expectedConfig.Devices, config.Devices) - require.Equal(t, tc.expectedConfig.MigConfigDevices, config.MigConfigDevices) - require.Equal(t, tc.expectedConfig.MigMonitorDevices, config.MigMonitorDevices) - require.Equal(t, tc.expectedConfig.DriverCapabilities, config.DriverCapabilities) + require.Equal(t, tc.expectedConfig.Devices, cfg.Devices) + require.Equal(t, tc.expectedConfig.MigConfigDevices, cfg.MigConfigDevices) + require.Equal(t, tc.expectedConfig.MigMonitorDevices, cfg.MigMonitorDevices) + require.Equal(t, tc.expectedConfig.DriverCapabilities, cfg.DriverCapabilities) - require.ElementsMatch(t, tc.expectedConfig.Requirements, config.Requirements) + require.ElementsMatch(t, tc.expectedConfig.Requirements, cfg.Requirements) }) } } @@ -689,10 +700,11 @@ func TestDeviceListSourcePriority(t *testing.T) { }, ), ) - hookConfig, _ := getDefaultHookConfig() - hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged - hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts - devices = getDevices(&hookConfig, image, tc.mountDevices, tc.privileged) + defaultConfig, _ := config.GetDefault() + cfg := &hookConfig{defaultConfig} + cfg.AcceptEnvvarUnprivileged = tc.acceptUnprivileged + cfg.AcceptDeviceListAsVolumeMounts = tc.acceptMounts + devices = cfg.getDevices(image, tc.mountDevices, tc.privileged) } // For all other tests, just grab the devices and check the results @@ -1028,8 +1040,10 @@ func TestGetDriverCapabilities(t *testing.T) { t.Run(tc.description, func(t *testing.T) { var capabilities string - c := HookConfig{ - SupportedDriverCapabilities: tc.supportedCapabilities, + c := hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: tc.supportedCapabilities, + }, } image, _ := image.New( diff --git a/cmd/nvidia-container-runtime-hook/hook_config.go b/cmd/nvidia-container-runtime-hook/hook_config.go index 4936af67..f88815e5 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config.go +++ b/cmd/nvidia-container-runtime-hook/hook_config.go @@ -17,8 +17,11 @@ const ( driverPath = "/run/nvidia/driver" ) -// HookConfig : options for the nvidia-container-runtime-hook. -type HookConfig config.Config +// hookConfig wraps the toolkit config. +// This allows for functions to be defined on the local type. +type hookConfig struct { + *config.Config +} // loadConfig loads the required paths for the hook config. func loadConfig() (*config.Config, error) { @@ -47,12 +50,12 @@ func loadConfig() (*config.Config, error) { return config.GetDefault() } -func getHookConfig() (*HookConfig, error) { +func getHookConfig() (*hookConfig, error) { cfg, err := loadConfig() if err != nil { return nil, fmt.Errorf("failed to load config: %v", err) } - config := (*HookConfig)(cfg) + config := &hookConfig{cfg} allSupportedDriverCapabilities := image.SupportedDriverCapabilities if config.SupportedDriverCapabilities == "all" { @@ -70,7 +73,7 @@ func getHookConfig() (*HookConfig, error) { // getConfigOption returns the toml config option associated with the // specified struct field. -func (c HookConfig) getConfigOption(fieldName string) string { +func (c hookConfig) getConfigOption(fieldName string) string { t := reflect.TypeOf(c) f, ok := t.FieldByName(fieldName) if !ok { @@ -84,7 +87,7 @@ func (c HookConfig) getConfigOption(fieldName string) string { } // getSwarmResourceEnvvars returns the swarm resource envvars for the config. -func (c *HookConfig) getSwarmResourceEnvvars() []string { +func (c *hookConfig) getSwarmResourceEnvvars() []string { if c.SwarmResource == "" { return nil } diff --git a/cmd/nvidia-container-runtime-hook/hook_config_test.go b/cmd/nvidia-container-runtime-hook/hook_config_test.go index 7c50ec12..ade45d27 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config_test.go +++ b/cmd/nvidia-container-runtime-hook/hook_config_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" ) @@ -89,10 +90,10 @@ func TestGetHookConfig(t *testing.T) { } } - var config HookConfig + var cfg hookConfig getHookConfig := func() { c, _ := getHookConfig() - config = *c + cfg = *c } if tc.expectedPanic { @@ -102,7 +103,7 @@ func TestGetHookConfig(t *testing.T) { getHookConfig() - require.EqualValues(t, tc.expectedDriverCapabilities, config.SupportedDriverCapabilities) + require.EqualValues(t, tc.expectedDriverCapabilities, cfg.SupportedDriverCapabilities) }) } } @@ -144,8 +145,10 @@ func TestGetSwarmResourceEnvvars(t *testing.T) { for i, tc := range testCases { t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { - c := &HookConfig{ - SwarmResource: tc.value, + c := &hookConfig{ + Config: &config.Config{ + SwarmResource: tc.value, + }, } envvars := c.getSwarmResourceEnvvars() diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index faaf0b51..886540e8 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -75,7 +75,7 @@ func doPrestart() { } cli := hook.NVIDIAContainerCLIConfig - container := getContainerConfig(*hook) + container := hook.getContainerConfig() nvidia := container.Nvidia if nvidia == nil { // Not a GPU container, nothing to do. From 72a0400a68766501340dbad90fb8fad70113c0e7 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 11 Sep 2024 16:47:16 +0200 Subject: [PATCH 3/8] Only allow host-relative LDConfig paths This change only allows host-relative LDConfig paths. An allow-ldconfig-from-container feature flag is added to allow for this the default behaviour to be changed. Signed-off-by: Evan Lezar --- internal/config/config.go | 17 +++- internal/config/config_test.go | 161 +++++++++++++++++++++++++++------ internal/config/features.go | 66 ++++++++------ internal/config/toml.go | 13 +++ internal/config/toml_test.go | 31 ++++++- internal/modifier/gated.go | 8 +- 6 files changed, 230 insertions(+), 66 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index 33b8ba4d..a6646846 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -18,6 +18,8 @@ package config import ( "bufio" + "errors" + "fmt" "os" "path/filepath" "strings" @@ -51,6 +53,8 @@ var ( NVIDIAContainerToolkitExecutable = "nvidia-container-toolkit" ) +var errInvalidConfig = errors.New("invalid config value") + // Config represents the contents of the config.toml file for the NVIDIA Container Toolkit // Note: This is currently duplicated by the HookConfig in cmd/nvidia-container-toolkit/hook_config.go type Config struct { @@ -127,7 +131,18 @@ func GetDefault() (*Config, error) { return &d, nil } -func getLdConfigPath() string { +// assertValid checks for a valid config. +func (c *Config) assertValid() error { + if !c.Features.AllowLDConfigFromContainer.IsEnabled() && !strings.HasPrefix(c.NVIDIAContainerCLIConfig.Ldconfig, "@") { + return fmt.Errorf("%w: nvidia-container-cli.ldconfig value %q is not host-relative (does not start with a '@')", errInvalidConfig, c.NVIDIAContainerCLIConfig.Ldconfig) + } + return nil +} + +// getLdConfigPath allows us to override this function for testing. +var getLdConfigPath = getLdConfigPathStub + +func getLdConfigPathStub() string { return NormalizeLDConfigPath("@/sbin/ldconfig") } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 0873ebd2..67f132f7 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -44,23 +44,21 @@ func TestGetConfigWithCustomConfig(t *testing.T) { func TestGetConfig(t *testing.T) { testCases := []struct { - description string - contents []string - expectedError error - inspectLdconfig bool - distIdsLike []string - expectedConfig *Config + description string + contents []string + expectedError error + distIdsLike []string + expectedConfig *Config }{ { - description: "empty config is default", - inspectLdconfig: true, + description: "empty config is default", expectedConfig: &Config{ AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "", LoadKmods: true, - Ldconfig: "WAS_CHECKED", + Ldconfig: "@/test/ld/config/path", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -93,7 +91,7 @@ func TestGetConfig(t *testing.T) { "supported-driver-capabilities = \"compute,utility\"", "nvidia-container-cli.root = \"/bar/baz\"", "nvidia-container-cli.load-kmods = false", - "nvidia-container-cli.ldconfig = \"/foo/bar/ldconfig\"", + "nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"", "nvidia-container-cli.user = \"foo:bar\"", "nvidia-container-runtime.debug = \"/foo/bar\"", "nvidia-container-runtime.discover-mode = \"not-legacy\"", @@ -113,7 +111,7 @@ func TestGetConfig(t *testing.T) { NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "/bar/baz", LoadKmods: false, - Ldconfig: "/foo/bar/ldconfig", + Ldconfig: "@/foo/bar/ldconfig", User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -146,6 +144,53 @@ func TestGetConfig(t *testing.T) { }, }, }, + { + description: "feature allows ldconfig to be overridden", + contents: []string{ + "[nvidia-container-cli]", + "ldconfig = \"/foo/bar/ldconfig\"", + "[features]", + "allow-ldconfig-from-container = true", + }, + expectedConfig: &Config{ + AcceptEnvvarUnprivileged: true, + SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "/foo/bar/ldconfig", + LoadKmods: true, + }, + NVIDIAContainerRuntimeConfig: RuntimeConfig{ + DebugFilePath: "/dev/null", + LogLevel: "info", + Runtimes: []string{"docker-runc", "runc", "crun"}, + Mode: "auto", + Modes: modesConfig{ + CSV: csvModeConfig{ + MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d", + }, + CDI: cdiModeConfig{ + DefaultKind: "nvidia.com/gpu", + AnnotationPrefixes: []string{ + "cdi.k8s.io/", + }, + SpecDirs: []string{ + "/etc/cdi", + "/var/run/cdi", + }, + }, + }, + }, + NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ + Path: "nvidia-container-runtime-hook", + }, + NVIDIACTKConfig: CTKConfig{ + Path: "nvidia-ctk", + }, + Features: features{ + AllowLDConfigFromContainer: ptr(feature(true)), + }, + }, + }, { description: "config options set in section", contents: []string{ @@ -154,7 +199,7 @@ func TestGetConfig(t *testing.T) { "[nvidia-container-cli]", "root = \"/bar/baz\"", "load-kmods = false", - "ldconfig = \"/foo/bar/ldconfig\"", + "ldconfig = \"@/foo/bar/ldconfig\"", "user = \"foo:bar\"", "[nvidia-container-runtime]", "debug = \"/foo/bar\"", @@ -179,7 +224,7 @@ func TestGetConfig(t *testing.T) { NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "/bar/baz", LoadKmods: false, - Ldconfig: "/foo/bar/ldconfig", + Ldconfig: "@/foo/bar/ldconfig", User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -213,16 +258,15 @@ func TestGetConfig(t *testing.T) { }, }, { - description: "suse config", - distIdsLike: []string{"suse", "opensuse"}, - inspectLdconfig: true, + description: "suse config", + distIdsLike: []string{"suse", "opensuse"}, expectedConfig: &Config{ AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "", LoadKmods: true, - Ldconfig: "WAS_CHECKED", + Ldconfig: "@/test/ld/config/path", User: "root:video", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -250,9 +294,8 @@ func TestGetConfig(t *testing.T) { }, }, { - description: "suse config overrides user", - distIdsLike: []string{"suse", "opensuse"}, - inspectLdconfig: true, + description: "suse config overrides user", + distIdsLike: []string{"suse", "opensuse"}, contents: []string{ "nvidia-container-cli.user = \"foo:bar\"", }, @@ -262,7 +305,7 @@ func TestGetConfig(t *testing.T) { NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "", LoadKmods: true, - Ldconfig: "WAS_CHECKED", + Ldconfig: "@/test/ld/config/path", User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -293,6 +336,7 @@ func TestGetConfig(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { + defer setGetLdConfigPathForTest()() defer setGetDistIDLikeForTest(tc.distIdsLike)() reader := strings.NewReader(strings.Join(tc.contents, "\n")) @@ -305,17 +349,59 @@ func TestGetConfig(t *testing.T) { cfg, err := tomlCfg.Config() require.NoError(t, err) - // We first handle the ldconfig path since this is currently system-dependent. - if tc.inspectLdconfig { - ldconfig := cfg.NVIDIAContainerCLIConfig.Ldconfig - require.True(t, strings.HasPrefix(ldconfig, "@/sbin/ldconfig")) - remaining := strings.TrimPrefix(ldconfig, "@/sbin/ldconfig") - require.True(t, remaining == ".real" || remaining == "") + require.EqualValues(t, tc.expectedConfig, cfg) + }) + } +} - cfg.NVIDIAContainerCLIConfig.Ldconfig = "WAS_CHECKED" - } +func TestAssertValid(t *testing.T) { + defer setGetLdConfigPathForTest()() - require.EqualValues(t, tc.expectedConfig, cfg) + testCases := []struct { + description string + config *Config + expectedError error + }{ + { + description: "default is valid", + config: func() *Config { + config, _ := GetDefault() + return config + }(), + }, + { + description: "alternative host ldconfig path is valid", + config: &Config{ + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "@/some/host/path", + }, + }, + }, + { + description: "non-host path is invalid", + config: &Config{ + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "/non/host/path", + }, + }, + expectedError: errInvalidConfig, + }, + { + description: "feature flag allows non-host path", + config: &Config{ + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "/non/host/path", + }, + Features: features{ + AllowLDConfigFromContainer: ptr(feature(true)), + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + require.ErrorIs(t, tc.config.assertValid(), tc.expectedError) }) } } @@ -335,3 +421,18 @@ func setGetDistIDLikeForTest(ids []string) func() { getDistIDLike = original } } + +// prt returns a reference to whatever type is passed into it +func ptr[T any](x T) *T { + return &x +} + +func setGetLdConfigPathForTest() func() { + previous := getLdConfigPath + getLdConfigPath = func() string { + return "@/test/ld/config/path" + } + return func() { + getLdConfigPath = previous + } +} diff --git a/internal/config/features.go b/internal/config/features.go index dfc6b165..240f6f87 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -19,10 +19,11 @@ package config type featureName string const ( - FeatureGDS = featureName("gds") - FeatureMOFED = featureName("mofed") - FeatureNVSWITCH = featureName("nvswitch") - FeatureGDRCopy = featureName("gdrcopy") + FeatureGDS = featureName("gds") + FeatureMOFED = featureName("mofed") + FeatureNVSWITCH = featureName("nvswitch") + FeatureGDRCopy = featureName("gdrcopy") + FeatureAllowLDConfigFromContainer = featureName("allow-ldconfig-from-container") ) // features specifies a set of named features. @@ -31,53 +32,58 @@ type features struct { MOFED *feature `toml:"mofed,omitempty"` NVSWITCH *feature `toml:"nvswitch,omitempty"` GDRCopy *feature `toml:"gdrcopy,omitempty"` + // AllowLDConfigFromContainer allows non-host ldconfig paths to be used. + // If this feature flag is not set to 'true' only host-rooted config paths + // (i.e. paths starting with an '@' are considered valid) + AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"` } type feature bool -// IsEnabled checks whether a specified named feature is enabled. +// IsEnabledInEnvironment checks whether a specified named feature is enabled. // An optional list of environments to check for feature-specific environment // variables can also be supplied. -func (fs features) IsEnabled(n featureName, in ...getenver) bool { - featureEnvvars := map[featureName]string{ - FeatureGDS: "NVIDIA_GDS", - FeatureMOFED: "NVIDIA_MOFED", - FeatureNVSWITCH: "NVIDIA_NVSWITCH", - FeatureGDRCopy: "NVIDIA_GDRCOPY", - } - - envvar := featureEnvvars[n] +func (fs features) IsEnabledInEnvironment(n featureName, in ...getenver) bool { switch n { + // Features with envvar overrides case FeatureGDS: - return fs.GDS.isEnabled(envvar, in...) + return fs.GDS.isEnabledWithEnvvarOverride("NVIDIA_GDS", in...) case FeatureMOFED: - return fs.MOFED.isEnabled(envvar, in...) + return fs.MOFED.isEnabledWithEnvvarOverride("NVIDIA_MOFED", in...) case FeatureNVSWITCH: - return fs.NVSWITCH.isEnabled(envvar, in...) + return fs.NVSWITCH.isEnabledWithEnvvarOverride("NVIDIA_NVSWITCH", in...) case FeatureGDRCopy: - return fs.GDRCopy.isEnabled(envvar, in...) + return fs.GDRCopy.isEnabledWithEnvvarOverride("NVIDIA_GDRCOPY", in...) + // Features without envvar overrides + case FeatureAllowLDConfigFromContainer: + return fs.AllowLDConfigFromContainer.IsEnabled() default: return false } } -// isEnabled checks whether a feature is enabled. -// If the enabled value is explicitly set, this is returned, otherwise the -// associated envvar is checked in the specified getenver for the string "enabled" -// A CUDA container / image can be passed here. -func (f *feature) isEnabled(envvar string, ins ...getenver) bool { +// IsEnabled checks whether a feature is enabled. +func (f *feature) IsEnabled() bool { if f != nil { return bool(*f) } - if envvar == "" { - return false - } - for _, in := range ins { - if in.Getenv(envvar) == "enabled" { - return true + return false +} + +// isEnabledWithEnvvarOverride checks whether a feature is enabled and allows an envvar to overide the feature. +// If the enabled value is explicitly set, this is returned, otherwise the +// associated envvar is checked in the specified getenver for the string "enabled" +// A CUDA container / image can be passed here. +func (f *feature) isEnabledWithEnvvarOverride(envvar string, ins ...getenver) bool { + if envvar != "" { + for _, in := range ins { + if in.Getenv(envvar) == "enabled" { + return true + } } } - return false + + return f.IsEnabled() } type getenver interface { diff --git a/internal/config/toml.go b/internal/config/toml.go index a1d37428..4df2a099 100644 --- a/internal/config/toml.go +++ b/internal/config/toml.go @@ -108,6 +108,19 @@ func loadConfigTomlFrom(reader io.Reader) (*Toml, error) { // Config returns the typed config associated with the toml tree. func (t *Toml) Config() (*Config, error) { + cfg, err := t.configNoOverrides() + if err != nil { + return nil, err + } + if err := cfg.assertValid(); err != nil { + return nil, err + } + return cfg, nil +} + +// configNoOverrides returns the typed config associated with the toml tree. +// This config does not include feature-specific overrides. +func (t *Toml) configNoOverrides() (*Config, error) { cfg, err := GetDefault() if err != nil { return nil, err diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index e017db15..f7c649f7 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -198,9 +198,12 @@ func TestTomlContents(t *testing.T) { } func TestConfigFromToml(t *testing.T) { + defer setGetLdConfigPathForTest()() + testCases := []struct { description string contents map[string]interface{} + expectedError error expectedConfig *Config }{ { @@ -226,13 +229,39 @@ func TestConfigFromToml(t *testing.T) { return c }(), }, + { + description: "invalid ldconfig value raises error", + contents: map[string]interface{}{ + "nvidia-container-cli": map[string]interface{}{ + "ldconfig": "/some/ldconfig/path", + }, + }, + expectedError: errInvalidConfig, + }, + { + description: "feature allows ldconfig override", + contents: map[string]interface{}{ + "nvidia-container-cli": map[string]interface{}{ + "ldconfig": "/some/ldconfig/path", + }, + "features": map[string]interface{}{ + "allow-ldconfig-from-container": true, + }, + }, + expectedConfig: func() *Config { + c, _ := GetDefault() + c.NVIDIAContainerCLIConfig.Ldconfig = "/some/ldconfig/path" + c.Features.AllowLDConfigFromContainer = ptr(feature(true)) + return c + }(), + }, } for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { tomlCfg := fromMap(tc.contents) config, err := tomlCfg.Config() - require.NoError(t, err) + require.ErrorIs(t, err, tc.expectedError) require.EqualValues(t, tc.expectedConfig, config) }) } diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 5bed3eaf..70322b35 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -46,7 +46,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image driverRoot := cfg.NVIDIAContainerCLIConfig.Root devRoot := cfg.NVIDIAContainerCLIConfig.Root - if cfg.Features.IsEnabled(config.FeatureGDS, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureGDS, image) { d, err := discover.NewGDSDiscoverer(logger, driverRoot, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for GDS devices: %w", err) @@ -54,7 +54,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if cfg.Features.IsEnabled(config.FeatureMOFED, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureMOFED, image) { d, err := discover.NewMOFEDDiscoverer(logger, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for MOFED devices: %w", err) @@ -62,7 +62,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if cfg.Features.IsEnabled(config.FeatureNVSWITCH, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureNVSWITCH, image) { d, err := discover.NewNvSwitchDiscoverer(logger, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for NVSWITCH devices: %w", err) @@ -70,7 +70,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if cfg.Features.IsEnabled(config.FeatureGDRCopy, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureGDRCopy, image) { d, err := discover.NewGDRCopyDiscoverer(logger, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for GDRCopy devices: %w", err) From c1f35807eaaafdcb2173e3f3e3db1a0b8a838ce1 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 16 Sep 2024 11:05:35 +0200 Subject: [PATCH 4/8] Skip injection of nvidia-persistenced socket by default This changes skips the injection of the nvidia-persistenced socket by default. An include-persistenced-socket feature flag is added to allow the injection of this socket to be explicitly requested. Signed-off-by: Evan Lezar --- cmd/nvidia-container-runtime-hook/main.go | 6 ++++++ internal/config/features.go | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index 886540e8..90e488d1 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -89,6 +89,12 @@ func doPrestart() { rootfs := getRootfsPath(container) args := []string{getCLIPath(cli)} + + // Only include the nvidia-persistenced socket if it is explicitly enabled. + if !hook.Features.IncludePersistencedSocket.IsEnabled() { + args = append(args, "--no-persistenced") + } + if cli.Root != "" { args = append(args, fmt.Sprintf("--root=%s", cli.Root)) } diff --git a/internal/config/features.go b/internal/config/features.go index 240f6f87..bde12191 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -24,6 +24,7 @@ const ( FeatureNVSWITCH = featureName("nvswitch") FeatureGDRCopy = featureName("gdrcopy") FeatureAllowLDConfigFromContainer = featureName("allow-ldconfig-from-container") + FeatureIncludePersistencedSocket = featureName("include-persistenced-socket") ) // features specifies a set of named features. @@ -36,6 +37,9 @@ type features struct { // If this feature flag is not set to 'true' only host-rooted config paths // (i.e. paths starting with an '@' are considered valid) AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"` + // IncludePersistencedSocket enables the injection of the nvidia-persistenced + // socket into containers. + IncludePersistencedSocket *feature `toml:"include-persistenced-socket,omitempty"` } type feature bool @@ -57,6 +61,8 @@ func (fs features) IsEnabledInEnvironment(n featureName, in ...getenver) bool { // Features without envvar overrides case FeatureAllowLDConfigFromContainer: return fs.AllowLDConfigFromContainer.IsEnabled() + case FeatureIncludePersistencedSocket: + return fs.IncludePersistencedSocket.IsEnabled() default: return false } From 854c815056b27dccfda1a63056e1af4eb4a48a1c Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 16 Sep 2024 11:19:02 +0200 Subject: [PATCH 5/8] Use include-persistenced-socket feature for CDI mode This change ensures that the internal CDI representation includes the persistenced socket if the include-persistenced-socket feature flag is enabled. Signed-off-by: Evan Lezar --- internal/discover/ipc.go | 12 +++++++----- internal/modifier/cdi.go | 1 + pkg/nvcdi/common-nvml.go | 2 +- pkg/nvcdi/driver-nvml.go | 22 +++++++++++----------- pkg/nvcdi/lib.go | 2 ++ pkg/nvcdi/management.go | 2 +- pkg/nvcdi/options.go | 11 +++++++++++ 7 files changed, 34 insertions(+), 18 deletions(-) diff --git a/internal/discover/ipc.go b/internal/discover/ipc.go index f636290f..6661bb9e 100644 --- a/internal/discover/ipc.go +++ b/internal/discover/ipc.go @@ -24,7 +24,12 @@ import ( type ipcMounts mounts // NewIPCDiscoverer creats a discoverer for NVIDIA IPC sockets. -func NewIPCDiscoverer(logger logger.Interface, driverRoot string) (Discover, error) { +func NewIPCDiscoverer(logger logger.Interface, driverRoot string, includePersistencedSocket bool) (Discover, error) { + var requiredSockets []string + if includePersistencedSocket { + requiredSockets = append(requiredSockets, "/nvidia-persistenced/socket") + } + sockets := newMounts( logger, lookup.NewFileLocator( @@ -34,10 +39,7 @@ func NewIPCDiscoverer(logger logger.Interface, driverRoot string) (Discover, err lookup.WithCount(1), ), driverRoot, - []string{ - "/nvidia-persistenced/socket", - "/nvidia-fabricmanager/socket", - }, + requiredSockets, ) mps := newMounts( diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index c5af4f88..d3e4e935 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -189,6 +189,7 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root), nvcdi.WithVendor("runtime.nvidia.com"), nvcdi.WithClass("gpu"), + nvcdi.WithOptInFeature("include-persistenced-socket", cfg.Features.IncludePersistencedSocket.IsEnabled()), ) if err != nil { return nil, fmt.Errorf("failed to construct CDI library: %w", err) diff --git a/pkg/nvcdi/common-nvml.go b/pkg/nvcdi/common-nvml.go index 4dd1bc35..6e9661cb 100644 --- a/pkg/nvcdi/common-nvml.go +++ b/pkg/nvcdi/common-nvml.go @@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) { l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err) } - driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib) + driverFiles, err := l.NewDriverDiscoverer() if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err) } diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index 8fb39888..59ff91c2 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -34,41 +34,41 @@ import ( // NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation. // The supplied NVML Library is used to query the expected driver version. -func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) { - if r := nvmllib.Init(); r != nvml.SUCCESS { +func (l *nvmllib) NewDriverDiscoverer() (discover.Discover, error) { + if r := l.nvmllib.Init(); r != nvml.SUCCESS { return nil, fmt.Errorf("failed to initialize NVML: %v", r) } defer func() { - if r := nvmllib.Shutdown(); r != nvml.SUCCESS { - logger.Warningf("failed to shutdown NVML: %v", r) + if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS { + l.logger.Warningf("failed to shutdown NVML: %v", r) } }() - version, r := nvmllib.SystemGetDriverVersion() + version, r := l.nvmllib.SystemGetDriverVersion() if r != nvml.SUCCESS { return nil, fmt.Errorf("failed to determine driver version: %v", r) } - return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) + return (*nvcdilib)(l).newDriverVersionDiscoverer(version) } -func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) { - libraries, err := NewDriverLibraryDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) +func (l *nvcdilib) newDriverVersionDiscoverer(version string) (discover.Discover, error) { + libraries, err := NewDriverLibraryDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) } - ipcs, err := discover.NewIPCDiscoverer(logger, driver.Root) + ipcs, err := discover.NewIPCDiscoverer(l.logger, l.driver.Root, l.optInFeatures["include-persistenced-socket"]) if err != nil { return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) } - firmwares, err := NewDriverFirmwareDiscoverer(logger, driver.Root, version) + firmwares, err := NewDriverFirmwareDiscoverer(l.logger, l.driver.Root, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err) } - binaries := NewDriverBinariesDiscoverer(logger, driver.Root) + binaries := NewDriverBinariesDiscoverer(l.logger, l.driver.Root) d := discover.Merge( libraries, diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index d2db3b6c..e14b2971 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -63,6 +63,8 @@ type nvcdilib struct { infolib info.Interface mergedDeviceOptions []transform.MergedDeviceOption + + optInFeatures map[string]bool } // New creates a new nvcdi library diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go index 4648e5bb..8f7709af 100644 --- a/pkg/nvcdi/management.go +++ b/pkg/nvcdi/management.go @@ -66,7 +66,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { return nil, fmt.Errorf("failed to get CUDA version: %v", err) } - driver, err := newDriverVersionDiscoverer(m.logger, m.driver, m.nvidiaCDIHookPath, m.ldconfigPath, version) + driver, err := (*nvcdilib)(m).newDriverVersionDiscoverer(version) if err != nil { return nil, fmt.Errorf("failed to create driver library discoverer: %v", err) } diff --git a/pkg/nvcdi/options.go b/pkg/nvcdi/options.go index 417687b9..762b5ac6 100644 --- a/pkg/nvcdi/options.go +++ b/pkg/nvcdi/options.go @@ -155,3 +155,14 @@ func WithLibrarySearchPaths(paths []string) Option { o.librarySearchPaths = paths } } + +// WithOptInFeature sets a specific opt-in feature. +// Note that previous opt-in-features are not removed. +func WithOptInFeature(feature string, enabled bool) Option { + return func(n *nvcdilib) { + if n.optInFeatures == nil { + n.optInFeatures = make(map[string]bool) + } + n.optInFeatures[feature] = enabled + } +} From fa0ec9d0b14869ce09d91b3938830752cf630ffe Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 16 Sep 2024 11:27:57 +0200 Subject: [PATCH 6/8] Allow inclusion of persistenced socket in CDI specification This change adds an include-persistenced-socket flag to the nvidia-ctk cdi generate command that ensures that a generated specification includes the nvidia-persistenced socket if present on the host. Note that for mangement mode, these sockets are always included if detected. Signed-off-by: Evan Lezar --- cmd/nvidia-ctk/cdi/generate/generate.go | 8 ++++++++ pkg/nvcdi/lib.go | 3 +++ 2 files changed, 11 insertions(+) diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 9f9e994b..9c2da04f 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -60,6 +60,8 @@ type options struct { files cli.StringSlice ignorePatterns cli.StringSlice } + + includePersistencedSocket bool } // NewCommand constructs a generate-cdi command with the specified logger @@ -169,6 +171,11 @@ func (m command) build() *cli.Command { Usage: "Specify a pattern the CSV mount specifications.", Destination: &opts.csv.ignorePatterns, }, + &cli.BoolFlag{ + Name: "include-persistenced-socket", + Usage: "Include the nvidia-persistenced socket in the generated CDI specification.", + Destination: &opts.includePersistencedSocket, + }, } return &c @@ -273,6 +280,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) { nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()), nvcdi.WithCSVFiles(opts.csv.files.Value()), nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()), + nvcdi.WithOptInFeature("include-persistenced-socket", opts.includePersistencedSocket), ) if err != nil { return nil, fmt.Errorf("failed to create CDI library: %v", err) diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index e14b2971..e535763b 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -132,6 +132,9 @@ func New(opts ...Option) (Interface, error) { if l.vendor == "" { l.vendor = "management.nvidia.com" } + // For management specifications we always allow the fabricmanager and + // persistenced sockets. + WithOptInFeature("include-persistenced-socket", true)(l) lib = (*managementlib)(l) case ModeNvml: lib = (*nvmllib)(l) From 30921c2dae233201ceac3e553e0ab2d6fca6c6ed Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 16 Sep 2024 15:42:59 +0200 Subject: [PATCH 7/8] [no-relnote] Move firmware functions to separate file Signed-off-by: Evan Lezar --- pkg/nvcdi/firmware.go | 95 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 pkg/nvcdi/firmware.go diff --git a/pkg/nvcdi/firmware.go b/pkg/nvcdi/firmware.go new file mode 100644 index 00000000..7d96422d --- /dev/null +++ b/pkg/nvcdi/firmware.go @@ -0,0 +1,95 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "golang.org/x/sys/unix" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" +) + +// newDriverFirmwareDiscoverer creates a discoverer for GSP firmware associated with the specified driver version. +func (l *nvcdilib) newDriverFirmwareDiscoverer(logger logger.Interface, driverRoot string, version string) (discover.Discover, error) { + if !l.optInFeatures["allow-gsp-firmware"] { + return discover.None{}, nil + } + + gspFirmwareSearchPaths, err := getFirmwareSearchPaths(logger) + if err != nil { + return nil, fmt.Errorf("failed to get firmware search paths: %v", err) + } + gspFirmwarePaths := filepath.Join("nvidia", version, "gsp*.bin") + return discover.NewMounts( + logger, + lookup.NewFileLocator( + lookup.WithLogger(logger), + lookup.WithRoot(driverRoot), + lookup.WithSearchPaths(gspFirmwareSearchPaths...), + ), + driverRoot, + []string{gspFirmwarePaths}, + ), nil +} + +func getUTSRelease() (string, error) { + utsname := &unix.Utsname{} + if err := unix.Uname(utsname); err != nil { + return "", err + } + return unix.ByteSliceToString(utsname.Release[:]), nil +} + +func getFirmwareSearchPaths(logger logger.Interface) ([]string, error) { + + var firmwarePaths []string + if p := getCustomFirmwareClassPath(logger); p != "" { + logger.Debugf("using custom firmware class path: %s", p) + firmwarePaths = append(firmwarePaths, p) + } + + utsRelease, err := getUTSRelease() + if err != nil { + return nil, fmt.Errorf("failed to get UTS_RELEASE: %v", err) + } + + standardPaths := []string{ + filepath.Join("/lib/firmware/updates/", utsRelease), + "/lib/firmware/updates/", + filepath.Join("/lib/firmware/", utsRelease), + "/lib/firmware/", + } + + return append(firmwarePaths, standardPaths...), nil +} + +// getCustomFirmwareClassPath returns the custom firmware class path if it exists. +func getCustomFirmwareClassPath(logger logger.Interface) string { + customFirmwareClassPath, err := os.ReadFile("/sys/module/firmware_class/parameters/path") + if err != nil { + logger.Warningf("failed to get custom firmware class path: %v", err) + return "" + } + + return strings.TrimSpace(string(customFirmwareClassPath)) +} From 61057e63d1711bd2f598048960f643163ca1bd33 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 16 Sep 2024 15:43:23 +0200 Subject: [PATCH 8/8] Skip injection of GSP firmware by default This change skips the injection of GSP firmware by default. An include-gsp-firmware feature flag is added to allow the firmware paths to be injected if required. Signed-off-by: Evan Lezar --- cmd/nvidia-container-runtime-hook/main.go | 4 ++ cmd/nvidia-ctk/cdi/generate/generate.go | 7 +++ internal/config/features.go | 5 ++ internal/modifier/cdi.go | 1 + pkg/nvcdi/driver-nvml.go | 65 +---------------------- pkg/nvcdi/firmware.go | 2 +- pkg/nvcdi/lib.go | 4 +- 7 files changed, 22 insertions(+), 66 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index 90e488d1..6b14496f 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -90,6 +90,10 @@ func doPrestart() { args := []string{getCLIPath(cli)} + // Only include GSP firmware if explicitly renabled. + if !hook.Features.IncludeGSPFirmware.IsEnabled() { + args = append(args, "--no-gsp-firmware") + } // Only include the nvidia-persistenced socket if it is explicitly enabled. if !hook.Features.IncludePersistencedSocket.IsEnabled() { args = append(args, "--no-persistenced") diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 9c2da04f..52623a9f 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -61,6 +61,7 @@ type options struct { ignorePatterns cli.StringSlice } + includeGSPFirmware bool includePersistencedSocket bool } @@ -171,6 +172,11 @@ func (m command) build() *cli.Command { Usage: "Specify a pattern the CSV mount specifications.", Destination: &opts.csv.ignorePatterns, }, + &cli.BoolFlag{ + Name: "include-gsp-firmware", + Usage: "Include the GSP firmware in the generated CDI specification.", + Destination: &opts.includeGSPFirmware, + }, &cli.BoolFlag{ Name: "include-persistenced-socket", Usage: "Include the nvidia-persistenced socket in the generated CDI specification.", @@ -280,6 +286,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) { nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()), nvcdi.WithCSVFiles(opts.csv.files.Value()), nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()), + nvcdi.WithOptInFeature("include-gsp-firmware", opts.includeGSPFirmware), nvcdi.WithOptInFeature("include-persistenced-socket", opts.includePersistencedSocket), ) if err != nil { diff --git a/internal/config/features.go b/internal/config/features.go index bde12191..e97a94d7 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -24,6 +24,7 @@ const ( FeatureNVSWITCH = featureName("nvswitch") FeatureGDRCopy = featureName("gdrcopy") FeatureAllowLDConfigFromContainer = featureName("allow-ldconfig-from-container") + FeatureIncludeGSPFirmware = featureName("include-gsp-firmware") FeatureIncludePersistencedSocket = featureName("include-persistenced-socket") ) @@ -37,6 +38,8 @@ type features struct { // If this feature flag is not set to 'true' only host-rooted config paths // (i.e. paths starting with an '@' are considered valid) AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"` + // IncludeGSPFirmware enables the injection of GSP firmware into containers. + IncludeGSPFirmware *feature `toml:"include-gsp-firmware,omitempty"` // IncludePersistencedSocket enables the injection of the nvidia-persistenced // socket into containers. IncludePersistencedSocket *feature `toml:"include-persistenced-socket,omitempty"` @@ -61,6 +64,8 @@ func (fs features) IsEnabledInEnvironment(n featureName, in ...getenver) bool { // Features without envvar overrides case FeatureAllowLDConfigFromContainer: return fs.AllowLDConfigFromContainer.IsEnabled() + case FeatureIncludeGSPFirmware: + return fs.IncludeGSPFirmware.IsEnabled() case FeatureIncludePersistencedSocket: return fs.IncludePersistencedSocket.IsEnabled() default: diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index d3e4e935..64d8ddf8 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -189,6 +189,7 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root), nvcdi.WithVendor("runtime.nvidia.com"), nvcdi.WithClass("gpu"), + nvcdi.WithOptInFeature("include-gsp-firmware", cfg.Features.IncludeGSPFirmware.IsEnabled()), nvcdi.WithOptInFeature("include-persistenced-socket", cfg.Features.IncludePersistencedSocket.IsEnabled()), ) if err != nil { diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index 59ff91c2..29cc0d27 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -18,12 +18,10 @@ package nvcdi import ( "fmt" - "os" "path/filepath" "strings" "github.com/NVIDIA/go-nvml/pkg/nvml" - "golang.org/x/sys/unix" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" @@ -63,7 +61,7 @@ func (l *nvcdilib) newDriverVersionDiscoverer(version string) (discover.Discover return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) } - firmwares, err := NewDriverFirmwareDiscoverer(l.logger, l.driver.Root, version) + firmwares, err := l.newDriverFirmwareDiscoverer(l.logger, l.driver.Root, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err) } @@ -107,67 +105,6 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv return d, nil } -func getUTSRelease() (string, error) { - utsname := &unix.Utsname{} - if err := unix.Uname(utsname); err != nil { - return "", err - } - return unix.ByteSliceToString(utsname.Release[:]), nil -} - -func getFirmwareSearchPaths(logger logger.Interface) ([]string, error) { - - var firmwarePaths []string - if p := getCustomFirmwareClassPath(logger); p != "" { - logger.Debugf("using custom firmware class path: %s", p) - firmwarePaths = append(firmwarePaths, p) - } - - utsRelease, err := getUTSRelease() - if err != nil { - return nil, fmt.Errorf("failed to get UTS_RELEASE: %v", err) - } - - standardPaths := []string{ - filepath.Join("/lib/firmware/updates/", utsRelease), - "/lib/firmware/updates/", - filepath.Join("/lib/firmware/", utsRelease), - "/lib/firmware/", - } - - return append(firmwarePaths, standardPaths...), nil -} - -// getCustomFirmwareClassPath returns the custom firmware class path if it exists. -func getCustomFirmwareClassPath(logger logger.Interface) string { - customFirmwareClassPath, err := os.ReadFile("/sys/module/firmware_class/parameters/path") - if err != nil { - logger.Warningf("failed to get custom firmware class path: %v", err) - return "" - } - - return strings.TrimSpace(string(customFirmwareClassPath)) -} - -// NewDriverFirmwareDiscoverer creates a discoverer for GSP firmware associated with the specified driver version. -func NewDriverFirmwareDiscoverer(logger logger.Interface, driverRoot string, version string) (discover.Discover, error) { - gspFirmwareSearchPaths, err := getFirmwareSearchPaths(logger) - if err != nil { - return nil, fmt.Errorf("failed to get firmware search paths: %v", err) - } - gspFirmwarePaths := filepath.Join("nvidia", version, "gsp*.bin") - return discover.NewMounts( - logger, - lookup.NewFileLocator( - lookup.WithLogger(logger), - lookup.WithRoot(driverRoot), - lookup.WithSearchPaths(gspFirmwareSearchPaths...), - ), - driverRoot, - []string{gspFirmwarePaths}, - ), nil -} - // NewDriverBinariesDiscoverer creates a discoverer for GSP firmware associated with the GPU driver. func NewDriverBinariesDiscoverer(logger logger.Interface, driverRoot string) discover.Discover { return discover.NewMounts( diff --git a/pkg/nvcdi/firmware.go b/pkg/nvcdi/firmware.go index 7d96422d..8c3b4ee0 100644 --- a/pkg/nvcdi/firmware.go +++ b/pkg/nvcdi/firmware.go @@ -31,7 +31,7 @@ import ( // newDriverFirmwareDiscoverer creates a discoverer for GSP firmware associated with the specified driver version. func (l *nvcdilib) newDriverFirmwareDiscoverer(logger logger.Interface, driverRoot string, version string) (discover.Discover, error) { - if !l.optInFeatures["allow-gsp-firmware"] { + if !l.optInFeatures["include-gsp-firmware"] { return discover.None{}, nil } diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index e535763b..2ca6e6b1 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -69,7 +69,9 @@ type nvcdilib struct { // New creates a new nvcdi library func New(opts ...Option) (Interface, error) { - l := &nvcdilib{} + l := &nvcdilib{ + optInFeatures: make(map[string]bool), + } for _, opt := range opts { opt(l) }