Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RHINENG-13952: add rhel ai gpu model fields to sp #404

Merged
merged 3 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions dev/test-archives/core-base/data/insights_commands/lspci_-k
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
00:00.0 Host bridge: Advanced Micro Devices, Inc. [AMD] Starship/Matisse Root Complex
Subsystem: Dell Device 08ff
07:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01)
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34
Kernel driver in use: amdgpu
Kernel modules: amdgpu
08:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01)
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34
Kernel driver in use: amdgpu
Kernel modules: amdgpu
0a:00.0 Processing accelerators: Habana Labs Ltd. Device 1020 (rev 01)
Subsystem: Habana Labs Ltd. Device 1020
Kernel driver in use: habanalabs
Kernel modules: habanalabs
41 changes: 41 additions & 0 deletions dev/test-archives/core-base/data/insights_commands/lspci_-vmmkn
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Slot: 00:00.0
Class: 0600
Vendor: 1022
Device: 1480
SVendor: 1028
SDevice: 08ff
NUMANode: 0

Slot: 07:00.0
Class: 0380
Vendor: 1002
Device: 738c
SVendor: 1002
SDevice: 0c34
Rev: 01
Driver: amdgpu
Module: amdgpu
NUMANode: 0

Slot: 08:00.0
Class: 0380
Vendor: 1002
Device: 738c
SVendor: 1002
SDevice: 0c34
Rev: 01
Driver: amdgpu
Module: amdgpu
NUMANode: 1

Slot: 0a:00.0
Class: 1200
Vendor: 1da3
Device: 1020
SVendor: 1da3
SDevice: 1020
PhySlot: 100
Rev: 01
Driver: habanalabs
Module: habanalabs
NUMANode: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "insights.specs.Specs.lspci", "exec_time": 9.298324584960938e-05, "errors": [], "results": {"type": "insights.core.spec_factory.CommandOutputProvider", "object": {"rc": null, "cmd": "/sbin/lspci -k", "args": null, "save_as": false, "relative_path": "insights_commands/lspci_-k"}}, "ser_time": 0.1908283233642578}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name": "insights.specs.Specs.lspci_vmmkn", "exec_time": 8.606910705566406e-05, "errors": [], "results": {"type": "insights.core.spec_factory.CommandOutputProvider", "object": {"rc": null, "cmd": "/sbin/lspci -vmmkn", "args": null, "save_as": false, "relative_path": "insights_commands/lspci_-vmmkn"}}, "ser_time": 0.20440101623535156}
28 changes: 28 additions & 0 deletions src/puptoo/process/profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from insights import make_metadata, rule, run
from insights.combiners.ansible_info import AnsibleInfo
from insights.combiners.cloud_provider import CloudProvider
from insights.combiners.lspci import LsPci
from insights.combiners.os_release import OSRelease
from insights.combiners.redhat_release import RedHatRelease
from insights.combiners.sap import Sap
Expand Down Expand Up @@ -91,6 +92,18 @@ def catch_error(parser, error):
"bp-63a5400a",
])

# PCI vendor/device IDs used to classify RHEL AI accelerator hardware from
# `lspci -vmmkn` output. Device IDs are frozensets: they are only ever used
# for membership tests, and freezing prevents accidental mutation of this
# module-level constant. Set literals replace the non-idiomatic set([...]).
RHEL_AI_GPU_MODEL_IDENTIFIERS = {
    "AMD_GPU": {
        # AMD/ATI (vendor 1002) Instinct-class GPU device IDs.
        "VENDOR_ID": "1002",
        "DEVICE_ID": frozenset({'740f', '740c', '7408', '738e', '738c',
                                '686c', '6864', '6860', '66a1', '66a0'})
    },
    "INTEL_GAUDI_HPU": {
        # Habana Labs (vendor 1da3) Gaudi HPU device IDs.
        "VENDOR_ID": "1da3",
        "DEVICE_ID": frozenset({'1020', '1010', '1000', '0030', '0001'})
    }
}


@rule(
optional=[
Expand Down Expand Up @@ -152,6 +165,7 @@ def catch_error(parser, error):
FalconctlBackend,
FalconctlVersion,
NvidiaSmiL,
LsPci,
EAPJSONReports,
ImageBuilderFacts,
]
Expand Down Expand Up @@ -215,6 +229,7 @@ def system_profile(
falconctl_backend,
falconctl_version,
nvidia_smi_l,
lspci,
eap_json_reports,
image_builder_facts,
):
Expand Down Expand Up @@ -770,6 +785,19 @@ def system_profile(
}
if nvidia_smi_l:
rhel_ai_profile["nvidia_gpu_models"] = [gpu["model"] for gpu in nvidia_smi_l]
if lspci:
rhel_ai_profile["amd_gpu_models"] = []
rhel_ai_profile["intel_gaudi_hpu_models"] = []
for pci in lspci:
subsystem = pci.get("Subsystem")
if (subsystem and
pci.get("Vendor") == RHEL_AI_GPU_MODEL_IDENTIFIERS["AMD_GPU"]["VENDOR_ID"] and
pci.get("Device") in RHEL_AI_GPU_MODEL_IDENTIFIERS["AMD_GPU"]["DEVICE_ID"]):
rhel_ai_profile["amd_gpu_models"].append(subsystem)
elif (subsystem and
pci.get("Vendor") == RHEL_AI_GPU_MODEL_IDENTIFIERS["INTEL_GAUDI_HPU"]["VENDOR_ID"] and
pci.get("Device") in RHEL_AI_GPU_MODEL_IDENTIFIERS["INTEL_GAUDI_HPU"]["DEVICE_ID"]):
rhel_ai_profile["intel_gaudi_hpu_models"].append(subsystem)
profile["rhel_ai"] = _remove_empties(rhel_ai_profile)

if image_builder_facts:
Expand Down
118 changes: 117 additions & 1 deletion tests/test_rhel_ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from insights.tests import InputData, run_test
from src.puptoo.process.profile import system_profile


OS_RELEASE_RHEL_AI = """
NAME="Red Hat Enterprise Linux"
VERSION="9.20240630.0.4 (Plow)"
Expand Down Expand Up @@ -72,6 +71,72 @@
GPU 1: Tesla V100-PCIE-16GB (UUID: GPU-b08ecee0-0ea5-7b07-d459-baa5b95f5e89)
""".strip()

LSPCI_K_AMD = """
00:00.0 Host bridge: Advanced Micro Devices, Inc. [AMD] Starship/Matisse Root Complex
Subsystem: Dell Device 08ff
07:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01)
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34
Kernel driver in use: amdgpu
Kernel modules: amdgpu
08:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Arcturus GL-XL [Instinct MI100] (rev 01)
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34
Kernel driver in use: amdgpu
Kernel modules: amdgpu
""".strip()

LSPCI_VMMKN_AMD = """
Slot: 00:00.0
Class: 0600
Vendor: 1022
Device: 1480
SVendor: 1028
SDevice: 08ff
NUMANode: 0

Slot: 07:00.0
Class: 0380
Vendor: 1002
Device: 738c
SVendor: 1002
SDevice: 0c34
Rev: 01
Driver: amdgpu
Module: amdgpu
NUMANode: 0

Slot: 08:00.0
Class: 0380
Vendor: 1002
Device: 738c
SVendor: 1002
SDevice: 0c34
Rev: 01
Driver: amdgpu
Module: amdgpu
NUMANode: 1
""".strip()

LSPCI_K_INTEL_GAUDI = """
0a:00.0 Processing accelerators: Habana Labs Ltd. Device 1020 (rev 01)
Subsystem: Habana Labs Ltd. Device 1020
Kernel driver in use: habanalabs
Kernel modules: habanalabs
""".strip()

LSPCI_VMMKN_INTEL_GAUDI = """
Slot: 0a:00.0
Class: 1200
Vendor: 1da3
Device: 1020
SVendor: 1da3
SDevice: 1020
PhySlot: 100
Rev: 01
Driver: habanalabs
Module: habanalabs
NUMANode: 0
""".strip()


def test_rhel_ai():

Expand Down Expand Up @@ -107,3 +172,54 @@ def test_rhel_ai():
input_data.add(Specs.nvidia_smi_l, NVIDIA_SMI_L)
result = run_test(system_profile, input_data)
assert "rhel_ai" not in result

# As a RHEL AI system, with amd_gpu_models
input_data = InputData()
input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI)
input_data.add(Specs.lspci, LSPCI_K_AMD)
input_data.add(Specs.lspci_vmmkn, LSPCI_VMMKN_AMD)
result = run_test(system_profile, input_data)
assert result["rhel_ai"]["variant"] == "RHEL AI"
assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3"
assert len(result["rhel_ai"]["amd_gpu_models"]) == 2
assert result["rhel_ai"]["amd_gpu_models"] == [
"Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34",
"Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34"]

# As a RHEL AI system, with intel_gaudi_hpu_models
input_data = InputData()
input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI)
input_data.add(Specs.lspci, LSPCI_K_INTEL_GAUDI)
input_data.add(Specs.lspci_vmmkn, LSPCI_VMMKN_INTEL_GAUDI)
result = run_test(system_profile, input_data)
assert result["rhel_ai"]["variant"] == "RHEL AI"
assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3"
assert len(result["rhel_ai"]["intel_gaudi_hpu_models"]) == 1
assert result["rhel_ai"]["intel_gaudi_hpu_models"] == ["Habana Labs Ltd. Device 1020"]

# As a RHEL AI system, with both amd_gpu_models and intel_gaudi_hpu_models
input_data = InputData()
input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI)
input_data.add(Specs.lspci, "\n".join([LSPCI_K_AMD, LSPCI_K_INTEL_GAUDI]))
input_data.add(Specs.lspci_vmmkn, "\n\n".join([LSPCI_VMMKN_AMD, LSPCI_VMMKN_INTEL_GAUDI]))
result = run_test(system_profile, input_data)
assert result["rhel_ai"]["variant"] == "RHEL AI"
assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3"
assert result["rhel_ai"]["amd_gpu_models"] == [
"Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34",
"Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34"]
assert result["rhel_ai"]["intel_gaudi_hpu_models"] == ["Habana Labs Ltd. Device 1020"]

# As a RHEL AI system, with both nvidia_gpu_models and amd_gpu_models
input_data = InputData()
input_data.add(Specs.os_release, OS_RELEASE_RHEL_AI)
input_data.add(Specs.nvidia_smi_l, NVIDIA_SMI_L)
input_data.add(Specs.lspci, LSPCI_K_AMD)
input_data.add(Specs.lspci_vmmkn, LSPCI_VMMKN_AMD)
result = run_test(system_profile, input_data)
assert result["rhel_ai"]["variant"] == "RHEL AI"
assert result["rhel_ai"]["rhel_ai_version_id"] == "v1.1.3"
assert result["rhel_ai"]["nvidia_gpu_models"] == ["NVIDIA T1000", "Tesla V100-PCIE-16GB"]
assert result["rhel_ai"]["amd_gpu_models"] == [
"Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34",
"Advanced Micro Devices, Inc. [AMD/ATI] Device 0c34"]
Loading