From 1fb5838f2199be1b6b1dd049cd1f957e5ca08e1d Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Thu, 19 Dec 2024 11:08:57 +0100 Subject: [PATCH] Add K3s-QEMU-SNP-GPU platform This adds a new `K3s-QEMU-SNP-GPU` platform that can be used for testing GPU-specific features on our on-prem infrastructure that uses this setup. This intentionally does not yet add any platform-specific behavior for when that platform is selected. --- .github/workflows/e2e_manual.yml | 2 +- cli/genpolicy/config.go | 2 +- cli/main.go | 2 +- e2e/internal/contrasttest/contrasttest.go | 4 ++-- internal/kuberesource/parts.go | 2 +- internal/manifest/referencevalues.go | 4 ++-- internal/platforms/platforms.go | 8 +++++++- justfile | 12 ++++++------ nodeinstaller/internal/config/kata_runtime_test.go | 2 +- nodeinstaller/internal/constants/constants.go | 4 ++-- nodeinstaller/node-installer.go | 6 +++--- nodeinstaller/node-installer_test.go | 4 ++++ packages/by-name/contrast/package.nix | 2 ++ packages/scripts.nix | 2 +- 14 files changed, 34 insertions(+), 22 deletions(-) diff --git a/.github/workflows/e2e_manual.yml b/.github/workflows/e2e_manual.yml index 13beda09c9..be65db14dd 100644 --- a/.github/workflows/e2e_manual.yml +++ b/.github/workflows/e2e_manual.yml @@ -46,7 +46,7 @@ jobs: echo "runner=ubuntu-22.04" >> "$GITHUB_OUTPUT" echo "self-hosted=false" >> "$GITHUB_OUTPUT" ;; - "K3s-QEMU-SNP") + "K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU") echo "runner=SNP" >> "$GITHUB_OUTPUT" echo "self-hosted=true" >> "$GITHUB_OUTPUT" ;; diff --git a/cli/genpolicy/config.go b/cli/genpolicy/config.go index 5b479580ea..327f520f6b 100644 --- a/cli/genpolicy/config.go +++ b/cli/genpolicy/config.go @@ -43,7 +43,7 @@ func NewConfig(platform platforms.Platform) *Config { Settings: aksSettings, Bin: aksGenpolicyBin, } - case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX: + case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, 
platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX: return &Config{ Rules: kataRules, Settings: kataSettings, diff --git a/cli/main.go b/cli/main.go index 63cc5671b5..e2a2c61833 100644 --- a/cli/main.go +++ b/cli/main.go @@ -105,7 +105,7 @@ func buildVersionString() (string, error) { switch platform { case platforms.AKSCloudHypervisorSNP: fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.MicrosoftGenpolicyVersion) - case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX: + case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX: fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.KataGenpolicyVersion) } } diff --git a/e2e/internal/contrasttest/contrasttest.go b/e2e/internal/contrasttest/contrasttest.go index 96e13e37d6..7ed77bc29e 100644 --- a/e2e/internal/contrasttest/contrasttest.go +++ b/e2e/internal/contrasttest/contrasttest.go @@ -202,7 +202,7 @@ func (ct *ContrastTest) patchReferenceValues(t *testing.T, platform platforms.Pl SNPVersion: toPtr(manifest.SVN(255)), MicrocodeVersion: toPtr(manifest.SVN(255)), } - case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP: + case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU: // The generate command doesn't fill in all required fields when // generating a manifest for baremetal SNP. Do that now. 
for i, snp := range m.ReferenceValues.SNP { @@ -372,7 +372,7 @@ func (ct *ContrastTest) FactorPlatformTimeout(timeout time.Duration) time.Durati switch ct.Platform { case platforms.AKSCloudHypervisorSNP: // AKS defined is the baseline return timeout - case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX: + case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX: return 2 * timeout default: return timeout diff --git a/internal/kuberesource/parts.go b/internal/kuberesource/parts.go index 7ec6e40bd9..a47754263c 100644 --- a/internal/kuberesource/parts.go +++ b/internal/kuberesource/parts.go @@ -136,7 +136,7 @@ func NodeInstaller(namespace string, platform platforms.Platform) (*NodeInstalle WithType(corev1.HostPathDirectory), )) snapshotterVolumes = nydusSnapshotterVolumes - case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.RKE2QEMUTDX: + case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX: nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata:latest" snapshotter = nydusSnapshotter nydusSnapshotterVolumes = append(nydusSnapshotterVolumes, Volume(). 
diff --git a/internal/manifest/referencevalues.go b/internal/manifest/referencevalues.go index dc423b2334..072d652251 100644 --- a/internal/manifest/referencevalues.go +++ b/internal/manifest/referencevalues.go @@ -143,11 +143,11 @@ func platformFromHandler(handler string) (platforms.Platform, error) { } parts := strings.Split(rest, "-") - if len(parts) != 4 { + if len(parts) != 4 && len(parts) != 5 { return platforms.Unknown, fmt.Errorf("invalid handler name: %s", handler) } - rawPlatform := fmt.Sprintf("%s-%s-%s", parts[0], parts[1], parts[2]) + rawPlatform := strings.Join(parts[:len(parts)-1], "-") platform, err := platforms.FromString(rawPlatform) if err != nil { diff --git a/internal/platforms/platforms.go b/internal/platforms/platforms.go index 106f6f7330..3c73f20e8b 100644 --- a/internal/platforms/platforms.go +++ b/internal/platforms/platforms.go @@ -28,11 +28,13 @@ const ( MetalQEMUSNP // MetalQEMUTDX is the generic platform for bare-metal TDX deployments. MetalQEMUTDX + // K3sQEMUSNPGPU represents a deployment with QEMU on bare-metal SNP K3s with GPU passthrough. + K3sQEMUSNPGPU ) // All returns a list of all available platforms. func All() []Platform { - return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX} + return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX, K3sQEMUSNPGPU} } // AllStrings returns a list of all available platforms as strings. 
@@ -53,6 +55,8 @@ func (p Platform) String() string { return "K3s-QEMU-TDX" case K3sQEMUSNP: return "K3s-QEMU-SNP" + case K3sQEMUSNPGPU: + return "K3s-QEMU-SNP-GPU" case RKE2QEMUTDX: return "RKE2-QEMU-TDX" case MetalQEMUSNP: @@ -73,6 +77,8 @@ func FromString(s string) (Platform, error) { return K3sQEMUTDX, nil case "k3s-qemu-snp": return K3sQEMUSNP, nil + case "k3s-qemu-snp-gpu": + return K3sQEMUSNPGPU, nil case "rke2-qemu-tdx": return RKE2QEMUTDX, nil case "metal-qemu-snp": diff --git a/justfile b/justfile index ea650a47c9..5f070c61ff 100644 --- a/justfile +++ b/justfile @@ -47,7 +47,7 @@ node-installer platform=default_platform: just push "tardev-snapshotter" just push "node-installer-microsoft" ;; - "Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") + "Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") just push "nydus-snapshotter" just push "node-installer-kata" ;; @@ -186,7 +186,7 @@ create-pre platform=default_platform: # TODO(burgerdev): this should create the resource group for consistency : ;; - "Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") + "Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") : ;; "AKS-PEER-SNP") @@ -215,7 +215,7 @@ create platform=default_platform: "AKS-CLH-SNP") nix run -L .#scripts.create-coco-aks -- --name="$azure_resource_group" --location="$azure_location" ;; - "Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") + "Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") : ;; "AKS-PEER-SNP") @@ -328,7 +328,7 @@ get-credentials platform=default_platform: "K3s-QEMU-TDX") nix run -L .#scripts.get-credentials "projects/796962942582/secrets/m50-ganondorf-kubeconf/versions/5" ;; - "K3s-QEMU-SNP") + "K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU") nix run -L .#scripts.get-credentials 
"projects/796962942582/secrets/discovery-kubeconf/versions/2" ;; *) @@ -352,7 +352,7 @@ destroy platform=default_platform: "AKS-CLH-SNP") nix run -L .#scripts.destroy-coco-aks -- --name="$azure_resource_group" ;; - "K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") + "K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") : ;; "AKS-PEER-SNP") @@ -377,7 +377,7 @@ destroy-post platform=default_platform: # TODO(burgerdev): this should destroy the resource group for consistency. : ;; - "K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") + "K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX") : ;; "AKS-PEER-SNP") diff --git a/nodeinstaller/internal/config/kata_runtime_test.go b/nodeinstaller/internal/config/kata_runtime_test.go index 25e81fee9e..bfef090fb3 100644 --- a/nodeinstaller/internal/config/kata_runtime_test.go +++ b/nodeinstaller/internal/config/kata_runtime_test.go @@ -28,7 +28,7 @@ func TestKataConfig(t *testing.T) { assert.Contains(string(configBytes), "[Runtime]") switch platform { - case platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX: + case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX: assert.Contains(string(configBytes), "[Hypervisor.qemu]") case platforms.AKSCloudHypervisorSNP: assert.Contains(string(configBytes), "[Hypervisor.clh]") diff --git a/nodeinstaller/internal/constants/constants.go b/nodeinstaller/internal/constants/constants.go index 106bc70655..de5d2dc751 100644 --- a/nodeinstaller/internal/constants/constants.go +++ b/nodeinstaller/internal/constants/constants.go @@ -75,7 +75,7 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer if debug { config.Hypervisor["qemu"]["enable_debug"] = true } - case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP: + case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU: if err := 
toml.Unmarshal([]byte(kataBareMetalQEMUSNPBaseConfig), &config); err != nil { return nil, fmt.Errorf("failed to unmarshal kata runtime configuration: %w", err) } @@ -133,7 +133,7 @@ func ContainerdRuntimeConfigFragment(baseDir, snapshotter string, platform platf cfg.Options = map[string]any{ "ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-tdx.toml"), } - case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP: + case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU: cfg.Options = map[string]any{ "ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-snp.toml"), } diff --git a/nodeinstaller/node-installer.go b/nodeinstaller/node-installer.go index e54f0cb98b..d4d2c52e25 100644 --- a/nodeinstaller/node-installer.go +++ b/nodeinstaller/node-installer.go @@ -113,7 +113,7 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform, case platforms.MetalQEMUTDX: kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-tdx.toml") containerdConfigPath = filepath.Join(hostMount, "etc", "containerd", "config.toml") - case platforms.K3sQEMUSNP: + case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU: kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-snp.toml") containerdConfigPath = filepath.Join(hostMount, "var", "lib", "rancher", "k3s", "agent", "etc", "containerd", "config.toml.tmpl") case platforms.K3sQEMUTDX: @@ -147,7 +147,7 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform, switch platform { case platforms.AKSCloudHypervisorSNP, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX: return restartHostContainerd(containerdConfigPath, "containerd") - case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP: + case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU: if hostServiceExists("k3s") { return restartHostContainerd(containerdConfigPath, "k3s") } else if hostServiceExists("k3s-agent") { @@ -212,7 +212,7 @@ func 
patchContainerdConfig(runtimeHandler, basePath, configPath string, platform case platforms.AKSCloudHypervisorSNP: snapshotterName = fmt.Sprintf("tardev-%s", runtimeHandler) socketName = fmt.Sprintf("/run/containerd/tardev-snapshotter-%s.sock", runtimeHandler) - case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.RKE2QEMUTDX: + case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX: snapshotterName = fmt.Sprintf("nydus-%s", runtimeHandler) socketName = fmt.Sprintf("/run/containerd/containerd-nydus-grpc-%s.sock", runtimeHandler) diff --git a/nodeinstaller/node-installer_test.go b/nodeinstaller/node-installer_test.go index 79f80ec34a..a1d14a4fc4 100644 --- a/nodeinstaller/node-installer_test.go +++ b/nodeinstaller/node-installer_test.go @@ -42,6 +42,10 @@ func TestPatchContainerdConfig(t *testing.T) { platform: platforms.K3sQEMUSNP, expected: expectedConfBareMetalQEMUSNP, }, + "BareMetalQEMUSNPGPU": { + platform: platforms.K3sQEMUSNPGPU, + expected: expectedConfBareMetalQEMUSNP, + }, "Unknown": { platform: platforms.Unknown, wantErr: true, diff --git a/packages/by-name/contrast/package.nix b/packages/by-name/contrast/package.nix index 6f83e64134..8fdfa6ec79 100644 --- a/packages/by-name/contrast/package.nix +++ b/packages/by-name/contrast/package.nix @@ -57,6 +57,7 @@ let rke2-qemu-tdx-handler = runtimeHandler "rke2-qemu-tdx" kata.contrast-node-installer-image.runtimeHash; metal-qemu-snp-handler = runtimeHandler "metal-qemu-snp" kata.contrast-node-installer-image.runtimeHash; k3s-qemu-snp-handler = runtimeHandler "k3s-qemu-snp" kata.contrast-node-installer-image.runtimeHash; + k3s-qemu-snp-gpu-handler = runtimeHandler "k3s-qemu-snp-gpu" kata.contrast-node-installer-image.runtimeHash; aksRefVals = { snp = [ @@ -135,6 +136,7 @@ let "${rke2-qemu-tdx-handler}" = tdxRefVals; "${metal-qemu-snp-handler}" = snpRefVals; 
"${k3s-qemu-snp-handler}" = snpRefVals; + "${k3s-qemu-snp-gpu-handler}" = snpRefVals; } ); diff --git a/packages/scripts.nix b/packages/scripts.nix index 83ef304ba5..35b5a0edac 100644 --- a/packages/scripts.nix +++ b/packages/scripts.nix @@ -259,7 +259,7 @@ cp ${pkgs.microsoft.genpolicy.settings-coordinator}/genpolicy-settings.json . ${pkgs.microsoft.genpolicy}/bin/genpolicy < "$tmpdir/coordinator_base.yml" ;; - "k3s-qemu-snp"|"k3s-qemu-tdx"|"rke2-qemu-tdx") + "k3s-qemu-snp"|"k3s-qemu-snp-gpu"|"k3s-qemu-tdx"|"rke2-qemu-tdx") cp ${pkgs.kata.genpolicy.rules-coordinator}/genpolicy-rules.rego rules.rego cp ${pkgs.kata.genpolicy.settings-coordinator}/genpolicy-settings.json . ${pkgs.kata.genpolicy}/bin/genpolicy < "$tmpdir/coordinator_base.yml"