From af2f85ca9348c18dbb1371dacb48c6f533e01e54 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 24 Jun 2024 23:18:58 -0500 Subject: [PATCH] Add `logical_node_type_id` field to `rocprofiler_agent_t` (#948) * Add logical_node_type_id field to rocprofiler_agent_t * Patch queue_controller --- source/include/rocprofiler-sdk/agent.h | 16 +++++++++++++ source/lib/rocprofiler-sdk/agent.cpp | 23 +++++++++++++++---- .../rocprofiler-sdk/hsa/queue_controller.cpp | 4 +++- source/lib/rocprofiler-sdk/tests/agent.cpp | 4 +++- 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/source/include/rocprofiler-sdk/agent.h b/source/include/rocprofiler-sdk/agent.h index 641ed793..473ce4b1 100644 --- a/source/include/rocprofiler-sdk/agent.h +++ b/source/include/rocprofiler-sdk/agent.h @@ -191,6 +191,22 @@ typedef struct rocprofiler_agent_v0_t ///< HSA_AMD_AGENT_INFO_DRIVER_NODE_ID property int32_t logical_node_id; ///< Logical sequence number. This will always be [0..N) where N is ///< the total number of agents + int32_t logical_node_type_id; + int32_t reserved_padding0; ///< padding logical_node_id to 64 bytes + + /// @var logical_node_type_id + /// @brief Logical sequence number with respect to other agents of same type. This will always + /// be [0..N) where N is the total number of X agents (where X is a ::rocprofiler_agent_type_t + /// value). This field is intended to help with environment variable indexing used to mask GPUs + /// at runtime (i.e. HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES) which start at zero and only + /// apply to GPUs, e.g., logical_node_type_id value for first GPU will be 0, second GPU will + /// have value of 1, etc., regardless of however many agents of a different type preceeded (and + /// thus increased the ::node_id or ::logical_node_id). + /// + /// Example: a system with 2 CPUs and 2 GPUs, where the node ids are 0=CPU, 1=GPU, 2=CPU, 3=GPU, + /// then then CPU node_ids 0 and 2 would have logical_node_type_id values of 0 and 1, + /// respectively, and GPU node_ids 1 and 3 would also have logical_node_type_id values of 0 + /// and 1. } rocprofiler_agent_v0_t; typedef rocprofiler_agent_v0_t rocprofiler_agent_t; diff --git a/source/lib/rocprofiler-sdk/agent.cpp b/source/lib/rocprofiler-sdk/agent.cpp index ced22f8a..d6c86356 100644 --- a/source/lib/rocprofiler-sdk/agent.cpp +++ b/source/lib/rocprofiler-sdk/agent.cpp @@ -369,6 +369,9 @@ read_topology() auto data = std::vector{}; uint64_t idcount = 0; uint64_t nodecount = 0; + uint64_t cpucount = 0; + uint64_t gpucount = 0; + uint64_t unkcount = 0; while(true) { @@ -398,11 +401,12 @@ read_topology() // we may have been able to open the properties file but if it was empty, we ignore it if(properties.empty()) continue; - auto agent_info = common::init_public_api_struct(rocprofiler_agent_t{}); - agent_info.type = ROCPROFILER_AGENT_TYPE_NONE; - agent_info.logical_node_id = idcount++; - agent_info.node_id = node_id; - agent_info.id.handle = (agent_info.logical_node_id) + get_agent_offset(); + auto agent_info = common::init_public_api_struct(rocprofiler_agent_t{}); + agent_info.type = ROCPROFILER_AGENT_TYPE_NONE; + agent_info.logical_node_id = idcount++; + agent_info.node_id = node_id; + agent_info.id.handle = (agent_info.logical_node_id) + get_agent_offset(); + agent_info.logical_node_type_id = -1; if(!name_prop.empty()) agent_info.model_name = @@ -419,6 +423,15 @@ read_topology() agent_info.type = ROCPROFILER_AGENT_TYPE_CPU; else if(agent_info.simd_count > 0) agent_info.type = ROCPROFILER_AGENT_TYPE_GPU; + else + ROCP_WARNING << "agent " << agent_info.node_id << " is neither a CPU nor a GPU"; + + if(agent_info.type == ROCPROFILER_AGENT_TYPE_CPU) + agent_info.logical_node_type_id = cpucount++; + else if(agent_info.type == ROCPROFILER_AGENT_TYPE_GPU) + agent_info.logical_node_type_id = gpucount++; + else + agent_info.logical_node_type_id = unkcount++; read_property(properties, "mem_banks_count", agent_info.mem_banks_count); read_property(properties, "caches_count", agent_info.caches_count); diff --git a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp index 83f2815c..0a778b0f 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp +++ b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp @@ -137,7 +137,9 @@ constexpr rocprofiler_agent_t default_agent = .product_name = nullptr, .model_name = nullptr, .node_id = 0, - .logical_node_id = 0}; + .logical_node_id = 0, + .logical_node_type_id = 0, + .reserved_padding0 = 0}; } // namespace void diff --git a/source/lib/rocprofiler-sdk/tests/agent.cpp b/source/lib/rocprofiler-sdk/tests/agent.cpp index 538be36b..ff4a0a93 100644 --- a/source/lib/rocprofiler-sdk/tests/agent.cpp +++ b/source/lib/rocprofiler-sdk/tests/agent.cpp @@ -103,9 +103,11 @@ TEST(rocprofiler_lib, agent_abi) EXPECT_EQ(offsetof(rocprofiler_agent_t, model_name), 272) << msg; EXPECT_EQ(offsetof(rocprofiler_agent_t, node_id), 280) << msg; EXPECT_EQ(offsetof(rocprofiler_agent_t, logical_node_id), 284) << msg; + EXPECT_EQ(offsetof(rocprofiler_agent_t, logical_node_type_id), 288) << msg; + EXPECT_EQ(offsetof(rocprofiler_agent_t, reserved_padding0), 292) << msg; // Add test for offset of new field above this. Do NOT change any existing values! - constexpr auto expected_rocp_agent_size = 288; + constexpr auto expected_rocp_agent_size = 296; // If a new field is added, increase this value by the size of the new field(s) EXPECT_EQ(sizeof(rocprofiler_agent_t), expected_rocp_agent_size) << "ABI break. If you added a new field, make sure that this is the only new check that "