diff --git a/dev/test-archives/core-base/data/insights_commands/lspci_-k b/dev/test-archives/core-base/data/insights_commands/lspci_-k new file mode 100644 index 00000000..5d176ac5 --- /dev/null +++ b/dev/test-archives/core-base/data/insights_commands/lspci_-k @@ -0,0 +1,14 @@ +00:00.0 Host bridge: Advanced Micro Devices, Inc. [AMD] Starship/Matisse Root Complex + Subsystem: Dell Device 08ff +07:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01) + Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34 + Kernel driver in use: amdgpu + Kernel modules: amdgpu +08:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01) + Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34 + Kernel driver in use: amdgpu + Kernel modules: amdgpu +0a:00.0 Processing accelerators: Habana Labs Ltd. Device 1020 (rev 01) + Subsystem: Habana Labs Ltd. Device 1020 + Kernel driver in use: habanalabs + Kernel modules: habanalabs \ No newline at end of file diff --git a/dev/test-archives/core-base/data/insights_commands/lspci_-vmmkn b/dev/test-archives/core-base/data/insights_commands/lspci_-vmmkn new file mode 100644 index 00000000..3393c78f --- /dev/null +++ b/dev/test-archives/core-base/data/insights_commands/lspci_-vmmkn @@ -0,0 +1,41 @@ +Slot: 00:00.0 +Class: 0600 +Vendor: 1022 +Device: 1480 +SVendor: 1028 +SDevice: 08ff +NUMANode: 0 + +Slot: 07:00.0 +Class: 0380 +Vendor: 1002 +Device: 738c +SVendor: 1002 +SDevice: 0c34 +Rev: 01 +Driver: amdgpu +Module: amdgpu +NUMANode: 0 + +Slot: 08:00.0 +Class: 0380 +Vendor: 1002 +Device: 738c +SVendor: 1002 +SDevice: 0c34 +Rev: 01 +Driver: amdgpu +Module: amdgpu +NUMANode: 1 + +Slot: 0a:00.0 +Class: 1200 +Vendor: 1da3 +Device: 1020 +SVendor: 1da3 +SDevice: 1020 +PhySlot: 100 +Rev: 01 +Driver: habanalabs +Module: habanalabs +NUMANode: 0 \ No newline at end of file diff --git a/dev/test-archives/core-base/meta_data/insights.specs.Specs.lspci.json b/dev/test-archives/core-base/meta_data/insights.specs.Specs.lspci.json new file mode 100644 index 00000000..d09520ca --- /dev/null +++ b/dev/test-archives/core-base/meta_data/insights.specs.Specs.lspci.json @@ -0,0 +1 @@ +{"name": "insights.specs.Specs.lspci", "exec_time": 9.298324584960938e-05, "errors": [], "results": {"type": "insights.core.spec_factory.CommandOutputProvider", "object": {"rc": null, "cmd": "/sbin/lspci -k", "args": null, "save_as": false, "relative_path": "insights_commands/lspci_-k"}}, "ser_time": 0.1908283233642578} \ No newline at end of file diff --git a/dev/test-archives/core-base/meta_data/insights.specs.Specs.lspci_vmmkn.json b/dev/test-archives/core-base/meta_data/insights.specs.Specs.lspci_vmmkn.json new file mode 100644 index 00000000..faf2f101 --- /dev/null +++ b/dev/test-archives/core-base/meta_data/insights.specs.Specs.lspci_vmmkn.json @@ -0,0 +1 @@ +{"name": "insights.specs.Specs.lspci_vmmkn", "exec_time": 8.606910705566406e-05, "errors": [], "results": {"type": "insights.core.spec_factory.CommandOutputProvider", "object": {"rc": null, "cmd": "/sbin/lspci -vmmkn", "args": null, "save_as": false, "relative_path": "insights_commands/lspci_-vmmkn"}}, "ser_time": 0.20440101623535156} \ No newline at end of file diff --git a/src/puptoo/process/profile.py b/src/puptoo/process/profile.py index b196027f..1ed5c30e 100644 --- a/src/puptoo/process/profile.py +++ b/src/puptoo/process/profile.py @@ -5,6 +5,7 @@ from insights import make_metadata, rule, run from insights.combiners.ansible_info import AnsibleInfo from insights.combiners.cloud_provider import CloudProvider +from insights.combiners.lspci import LsPci from insights.combiners.os_release import OSRelease from insights.combiners.redhat_release import RedHatRelease from insights.combiners.sap import Sap @@ -91,6 +92,18 @@ def catch_error(parser, error): "bp-63a5400a", ]) +RHEL_AI_GPU_MODEL_IDENTIFIERS = { + "AMD_GPU": { + "VENDOR_ID": "1002", + "DEVICE_ID": set(['740f', '740c', '7408', '738e', '738c', + '686c', '6864', '6860', '66a1', '66a0']) + }, + "INTEL_GAUDI_HPU": { + "VENDOR_ID": "1da3", + "DEVICE_ID": set(['1020', '1010', '1000', '0030', '0001']) + } +} + @rule( optional=[ @@ -152,6 +165,7 @@ def catch_error(parser, error): FalconctlBackend, FalconctlVersion, NvidiaSmiL, + LsPci, EAPJSONReports, ImageBuilderFacts, ] @@ -215,6 +229,7 @@ def system_profile( falconctl_backend, falconctl_version, nvidia_smi_l, + lspci, eap_json_reports, image_builder_facts, ): @@ -770,6 +785,19 @@ def system_profile( } if nvidia_smi_l: rhel_ai_profile["nvidia_gpu_models"] = [gpu["model"] for gpu in nvidia_smi_l] + if lspci: + rhel_ai_profile["amd_gpu_models"] = [] + rhel_ai_profile["intel_gaudi_hpu_models"] = [] + for pci in lspci: + subsystem = pci.get("Subsystem") + if (subsystem and + pci.get("Vendor") == RHEL_AI_GPU_MODEL_IDENTIFIERS["AMD_GPU"]["VENDOR_ID"] and + pci.get("Device") in RHEL_AI_GPU_MODEL_IDENTIFIERS["AMD_GPU"]["DEVICE_ID"]): + rhel_ai_profile["amd_gpu_models"].append(subsystem) + elif (subsystem and + pci.get("Vendor") == RHEL_AI_GPU_MODEL_IDENTIFIERS["INTEL_GAUDI_HPU"]["VENDOR_ID"] and + pci.get("Device") in RHEL_AI_GPU_MODEL_IDENTIFIERS["INTEL_GAUDI_HPU"]["DEVICE_ID"]): + rhel_ai_profile["intel_gaudi_hpu_models"].append(subsystem) profile["rhel_ai"] = _remove_empties(rhel_ai_profile) if image_builder_facts: diff --git a/tests/test_rhel_ai.py b/tests/test_rhel_ai.py index 99c13e98..dd76c0e4 100644 --- a/tests/test_rhel_ai.py +++ b/tests/test_rhel_ai.py @@ -2,7 +2,6 @@ from insights.tests import InputData, run_test from src.puptoo.process.profile import system_profile - OS_RELEASE_RHEL_AI = """ NAME="Red Hat Enterprise Linux" VERSION="9.20240630.0.4 (Plow)" @@ -72,6 +71,72 @@ GPU 1: Tesla V100-PCIE-16GB (UUID: GPU-b08ecee0-0ea5-7b07-d459-baa5b95f5e89) """.strip() +LSPCI_K_AMD = """ +00:00.0 Host bridge: Advanced Micro Devices, Inc. [AMD] Starship/Matisse Root Complex + Subsystem: Dell Device 08ff +07:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01) + Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34 + Kernel driver in use: amdgpu + Kernel modules: amdgpu +08:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01) + Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34 + Kernel driver in use: amdgpu + Kernel modules: amdgpu +""".strip() + +LSPCI_VMMKN_AMD = """ +Slot: 00:00.0 +Class: 0600 +Vendor: 1022 +Device: 1480 +SVendor: 1028 +SDevice: 08ff +NUMANode: 0 + +Slot: 07:00.0 +Class: 0380 +Vendor: 1002 +Device: 738c +SVendor: 1002 +SDevice: 0c34 +Rev: 01 +Driver: amdgpu +Module: amdgpu +NUMANode: 0 + +Slot: 08:00.0 +Class: 0380 +Vendor: 1002 +Device: 738c +SVendor: 1002 +SDevice: 0c34 +Rev: 01 +Driver: amdgpu +Module: amdgpu +NUMANode: 1 +""".strip() + +LSPCI_K_INTEL_GAUDI = """ +0a:00.0 Processing accelerators: Habana Labs Ltd. Device 1020 (rev 01) + Subsystem: Habana Labs Ltd. Device 1020 + Kernel driver in use: habanalabs + Kernel modules: habanalabs +""".strip() + +LSPCI_VMMKN_INTEL_GAUDI = """ +Slot: 0a:00.0 +Class: 1200 +Vendor: 1da3 +Device: 1020 +SVendor: 1da3 +SDevice: 1020 +PhySlot: 100 +Rev: 01 +Driver: habanalabs +Module: habanalabs +NUMANode: 0 +""".strip() + def test_rhel_ai(): @@ -107,3 +172,54 @@ def test_rhel_ai(): input_data.add(Specs.nvidia_smi_l, NVIDIA_SMI_L) result = run_test(system_profile, input_data) assert "rhel_ai" not in result + + # As a RHEL AI system, with amd_gpu_models + input_data = InputData() + input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI) + input_data.add(Specs.lspci, LSPCI_K_AMD) + input_data.add(Specs.lspci_vmmkn, LSPCI_VMMKN_AMD) + result = run_test(system_profile, input_data) + assert result["rhel_ai"]["variant"] == "RHEL AI" + assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3" + assert len(result["rhel_ai"]["amd_gpu_models"]) == 2 + assert result["rhel_ai"]["amd_gpu_models"] == [ + "Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34", + "Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34"] + + # As a RHEL AI system, with intel_gaudi_hpu_models + input_data = InputData() + input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI) + input_data.add(Specs.lspci, LSPCI_K_INTEL_GAUDI) + input_data.add(Specs.lspci_vmmkn, LSPCI_VMMKN_INTEL_GAUDI) + result = run_test(system_profile, input_data) + assert result["rhel_ai"]["variant"] == "RHEL AI" + assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3" + assert len(result["rhel_ai"]["intel_gaudi_hpu_models"]) == 1 + assert result["rhel_ai"]["intel_gaudi_hpu_models"] == ["Habana Labs Ltd. Device 1020"] + + # As a RHEL AI system, with both amd_gpu_models and intel_gaudi_hpu_models + input_data = InputData() + input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI) + input_data.add(Specs.lspci, "\n".join([LSPCI_K_AMD, LSPCI_K_INTEL_GAUDI])) + input_data.add(Specs.lspci_vmmkn, "\n\n".join([LSPCI_VMMKN_AMD, LSPCI_VMMKN_INTEL_GAUDI])) + result = run_test(system_profile, input_data) + assert result["rhel_ai"]["variant"] == "RHEL AI" + assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3" + assert result["rhel_ai"]["amd_gpu_models"] == [ + "Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34", + "Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34"] + assert result["rhel_ai"]["intel_gaudi_hpu_models"] == ["Habana Labs Ltd. Device 1020"] + + # As a RHEL AI system, with both nvidia_gpu_models and amd_gpu_models + input_data = InputData() + input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI) + input_data.add(Specs.nvidia_smi_l, NVIDIA_SMI_L) + input_data.add(Specs.lspci, LSPCI_K_AMD) + input_data.add(Specs.lspci_vmmkn, LSPCI_VMMKN_AMD) + result = run_test(system_profile, input_data) + assert result["rhel_ai"]["variant"] == "RHEL AI" + assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3" + assert result["rhel_ai"]["nvidia_gpu_models"] == ["NVIDIA T1000", "Tesla V100-PCIE-16GB"] + assert result["rhel_ai"]["amd_gpu_models"] == [ + "Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34", + "Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34"]