diff --git a/CHANGELOG.md b/CHANGELOG.md index e4cb7345..379ed796 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec - Changed naming of "dispatch profiling service" to a more descriptive "dispatch counting service". To convert existing tool or user code to the new names, the following sed can be used: `-type f -exec sed -i -e 's/dispatch_profile_counting_service/dispatch_counting_service/g' -e 's/dispatch_profile.h/dispatch_counting_service.h/g' -e 's/rocprofiler_profile_counting_dispatch_callback_t/rocprofiler_dispatch_counting_service_callback_t/g' -e 's/rocprofiler_profile_counting_dispatch_data_t/rocprofiler_dispatch_counting_service_data_t/g' -e 's/rocprofiler_profile_counting_dispatch_record_t/rocprofiler_dispatch_counting_service_record_t/g' {} +` - `FETCH_SIZE` metric on gfx94x now uses `TCC_BUBBLE` for 128B reads. - PMC dispatch-based counter collection serialization is now per-device instead of being global across all devices. +- Added optional `output_records`/`rec_count` parameters to rocprofiler_sample_device_counting_service so sampled records can be returned directly to the caller. - Added rocprofiler_load_counter_definition. ### Resolved issues diff --git a/samples/counter_collection/client.cpp b/samples/counter_collection/client.cpp index dbec8d5f..0a5e6412 100644 --- a/samples/counter_collection/client.cpp +++ b/samples/counter_collection/client.cpp @@ -358,7 +358,7 @@ tool_init(rocprofiler_client_finalize_t, void* user_data) // below to select the profile config to use when a kernel dispatch is // recieved. 
get_profile_cache().emplace( - agent.id.handle, build_profile_for_agent(agent.id, std::set{"SQ_WAVES"})); + agent.id.handle, build_profile_for_agent(agent.id, std::set{"TCC_HIT"})); } auto client_thread = rocprofiler_callback_thread_t{}; diff --git a/samples/counter_collection/device_counting.cpp b/samples/counter_collection/device_counting.cpp index c33bed18..62d5f5f7 100644 --- a/samples/counter_collection/device_counting.cpp +++ b/samples/counter_collection/device_counting.cpp @@ -289,8 +289,11 @@ tool_init(rocprofiler_client_finalize_t, void* user_data) rocprofiler_start_context(get_client_ctx()); while(exit_toggle().load() == false) { - rocprofiler_sample_device_counting_service( - get_client_ctx(), {.value = count}, ROCPROFILER_COUNTER_FLAG_NONE); + rocprofiler_sample_device_counting_service(get_client_ctx(), + {.value = count}, + ROCPROFILER_COUNTER_FLAG_NONE, + nullptr, + nullptr); count++; std::this_thread::sleep_for(std::chrono::milliseconds(50)); } diff --git a/source/include/rocprofiler-sdk/device_counting_service.h b/source/include/rocprofiler-sdk/device_counting_service.h index 9bc72534..7ba72cc4 100644 --- a/source/include/rocprofiler-sdk/device_counting_service.h +++ b/source/include/rocprofiler-sdk/device_counting_service.h @@ -106,18 +106,28 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i * @param [in] context_id context id * @param [in] user_data User supplied data, included in records outputted to buffer. * @param [in] flags Flags to specify how the counter data should be collected (defaults to sync). + * @param [out] output_records Output records collected via sampling (output is also written to + * buffer). Must be allocated by caller, or nullptr to only write to the buffer. + * @param [in,out] rec_count On entry, this is the maximum number of records rocprof can store in + * output_records. On exit, contains the number of records written (or required, if too small). 
* @return ::rocprofiler_status_t * @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or * the context is not configured for agent profiling. * @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR Returned if another operation is in progress ( * start/stop ctx or another read). * @retval ::ROCPROFILER_STATUS_ERROR Returned if HSA has not been initialized yet. + * @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES Returned if output_records is set but rec_count is + * too small to store the results. * @retval ::ROCPROFILER_STATUS_SUCCESS Returned if read request was successful. + * @retval ::ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT Returned if ASYNC is being used while + * output_records is not null. */ rocprofiler_status_t -rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id, - rocprofiler_user_data_t user_data, - rocprofiler_counter_flag_t flags) ROCPROFILER_API; +rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id, + rocprofiler_user_data_t user_data, + rocprofiler_counter_flag_t flags, + rocprofiler_record_counter_t* output_records, + size_t* rec_count) ROCPROFILER_API; /** @} */ diff --git a/source/lib/rocprofiler-sdk/counters/device_counting.cpp b/source/lib/rocprofiler-sdk/counters/device_counting.cpp index 2905bb90..8e6d4bf4 100644 --- a/source/lib/rocprofiler-sdk/counters/device_counting.cpp +++ b/source/lib/rocprofiler-sdk/counters/device_counting.cpp @@ -156,6 +156,10 @@ agent_async_handler(hsa_signal_value_t /*signal_v*/, void* data) { val.user_data = callback_data.user_data; val.agent_id = prof_config->agent->id; + if(callback_data.cached_counters) + { + callback_data.cached_counters->push_back(val); + } buf->emplace( ROCPROFILER_BUFFER_CATEGORY_COUNTERS, ROCPROFILER_COUNTER_RECORD_VALUE, val); } @@ -253,9 +257,10 @@ init_callback_data(rocprofiler::counters::agent_callback_data& callback_data, * and trigger the async handler manually. 
*/ rocprofiler_status_t -read_agent_ctx(const context::context* ctx, - rocprofiler_user_data_t user_data, - rocprofiler_counter_flag_t flags) +read_agent_ctx(const context::context* ctx, + rocprofiler_user_data_t user_data, + rocprofiler_counter_flag_t flags, + std::vector* out_counters) { rocprofiler_status_t status = ROCPROFILER_STATUS_SUCCESS; if(!ctx->device_counter_collection) @@ -282,6 +287,18 @@ read_agent_ctx(const context::context* ctx, for(auto& callback_data : agent_ctx.agent_data) { + auto wait_if_sync = [&]() { + if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0) + { + // Wait for any inprogress samples to complete before returning + hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion, + HSA_SIGNAL_CONDITION_EQ, + 1, + UINT64_MAX, + HSA_WAIT_STATE_ACTIVE); + } + }; + if(!callback_data.profile || !callback_data.set_profile) continue; const auto* agent = agent::get_agent_cache(callback_data.profile->agent); @@ -295,6 +312,11 @@ read_agent_ctx(const context::context* ctx, // No AQL packet, nothing to do here. if(!callback_data.packet) continue; + wait_if_sync(); + + if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0) + callback_data.cached_counters = out_counters; + // If we have no hardware counters but a packet. The caller is expecting // non-hardware based counter values to be returned. 
We can skip packet injection // and trigger the async handler directly @@ -302,16 +324,7 @@ read_agent_ctx(const context::context* ctx, { callback_data.user_data = user_data; hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, -1); - // Wait for the barrier/read packet to complete - if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC) - { - // Wait for any inprogress samples to complete before returning - hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion, - HSA_SIGNAL_CONDITION_EQ, - 1, - UINT64_MAX, - HSA_WAIT_STATE_ACTIVE); - } + wait_if_sync(); continue; } @@ -334,17 +347,8 @@ read_agent_ctx(const context::context* ctx, hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, 0); callback_data.user_data = user_data; submitPacket(agent->profile_queue(), &barrier.barrier_and); - - // Wait for the barrier/read packet to complete - if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC) - { - // Wait for any inprogress samples to complete before returning - hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion, - HSA_SIGNAL_CONDITION_EQ, - 1, - UINT64_MAX, - HSA_WAIT_STATE_ACTIVE); - } + wait_if_sync(); + if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0) callback_data.cached_counters = nullptr; } agent_ctx.status.exchange(rocprofiler::context::device_counting_service::state::ENABLED); diff --git a/source/lib/rocprofiler-sdk/counters/device_counting.hpp b/source/lib/rocprofiler-sdk/counters/device_counting.hpp index 6a1a738b..874e6516 100644 --- a/source/lib/rocprofiler-sdk/counters/device_counting.hpp +++ b/source/lib/rocprofiler-sdk/counters/device_counting.hpp @@ -27,6 +27,7 @@ #include #include #include +#include namespace rocprofiler { @@ -55,11 +56,12 @@ struct agent_callback_data rocprofiler_user_data_t user_data = {.value = 0}; rocprofiler_user_data_t callback_data = {.value = 0}; - std::shared_ptr profile = {}; - rocprofiler_agent_id_t agent_id = {.handle = 0}; - 
rocprofiler_device_counting_service_callback_t cb = nullptr; - rocprofiler_buffer_id_t buffer = {.handle = 0}; - bool set_profile = false; + std::shared_ptr profile = {}; + rocprofiler_agent_id_t agent_id = {.handle = 0}; + rocprofiler_device_counting_service_callback_t cb = nullptr; + rocprofiler_buffer_id_t buffer = {.handle = 0}; + bool set_profile = false; + std::vector* cached_counters = nullptr; agent_callback_data() = default; agent_callback_data(agent_callback_data&& rhs) noexcept @@ -115,9 +117,10 @@ stop_agent_ctx(const context::context* ctx); // read calls are not allowed in ASYNC mode and will result in // this call waiting for the previous sample to complete. rocprofiler_status_t -read_agent_ctx(const context::context* ctx, - rocprofiler_user_data_t user_data, - rocprofiler_counter_flag_t flags); +read_agent_ctx(const context::context* ctx, + rocprofiler_user_data_t user_data, + rocprofiler_counter_flag_t flags, + std::vector* out_counters); uint64_t submitPacket(hsa_queue_t* queue, const void* packet); diff --git a/source/lib/rocprofiler-sdk/counters/tests/device_counting.cpp b/source/lib/rocprofiler-sdk/counters/tests/device_counting.cpp index 6e26cc81..c86b0091 100644 --- a/source/lib/rocprofiler-sdk/counters/tests/device_counting.cpp +++ b/source/lib/rocprofiler-sdk/counters/tests/device_counting.cpp @@ -108,10 +108,10 @@ test_init() hsa::get_queue_controller()->init(get_api_table(), get_ext_table()); } -std::vector& +common::Synchronized>& global_recs() { - static std::vector recs; + static common::Synchronized> recs; return recs; } @@ -146,7 +146,7 @@ check_output_created(rocprofiler_context_id_t, } found_value = record->user_data.value; // ROCP_ERROR << fmt::format("Found counter value: {}", record->counter_value); - global_recs().push_back(*record); + global_recs().wlock([&](auto& data) { data.push_back(*record); }); } } @@ -319,6 +319,7 @@ class device_counting_service_test : public ::testing::Test size_t track_metric = 0; for(auto& metric : 
metrics) { + std::vector output_records(10000); // global_recs().clear(); track_metric++; ROCP_ERROR << "Testing metric " << metric.name(); @@ -402,9 +403,23 @@ class device_counting_service_test : public ::testing::Test HSA_WAIT_STATE_BLOCKED); // Sample the counting service. - ROCPROFILER_CALL( - rocprofiler_sample_device_counting_service(ctx, {.value = track_metric}, flags), - "Could not sample"); + + if(flags == ROCPROFILER_COUNTER_FLAG_ASYNC) + { + ROCPROFILER_CALL(rocprofiler_sample_device_counting_service( + ctx, {.value = track_metric}, flags, nullptr, nullptr), + "Could not sample"); + } + else + { + global_recs().wlock([&](auto& _data) { _data.clear(); }); + size_t out_count = output_records.size(); + ROCPROFILER_CALL( + rocprofiler_sample_device_counting_service( + ctx, {.value = track_metric}, flags, output_records.data(), &out_count), + "Could not sample"); + output_records.resize(out_count); + } ROCPROFILER_CALL(rocprofiler_stop_context(ctx), "Could not stop context"); rocprofiler_flush_buffer(opt_buff_id); @@ -417,6 +432,27 @@ class device_counting_service_test : public ::testing::Test { ROCP_FATAL << "Failed to get data for " << metric.name(); } + else if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC) + { + auto recs_local = global_recs().rlock([](const auto& data) { return data; }); + + if(recs_local.size() != output_records.size()) + { + ROCP_FATAL << "Output size does not match: " << recs_local.size() << " " + << output_records.size(); + } + if(!std::equal(recs_local.begin(), + recs_local.end(), + output_records.begin(), + [](const auto& a, const auto& b) { + return a.id == b.id && a.counter_value == b.counter_value && + a.dispatch_id == b.dispatch_id && + a.agent_id.handle == b.agent_id.handle; + })) + { + ROCP_FATAL << "Output does not match between buffer and callback"; + } + } } hsa_signal_destroy(completion_signal); hsa_signal_destroy(found_data); @@ -599,9 +635,10 @@ TEST_F(device_counting_service_test, async_counters) { test_run(ROCPROFILER_COUN 
TEST_F(device_counting_service_test, sync_grbm_verify) { test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GRBM_COUNT"}, 50000); - ROCP_ERROR << global_recs().size(); + auto local_recs = global_recs().rlock([](const auto& data) { return data; }); + ROCP_ERROR << local_recs.size(); - for(const auto& val : global_recs()) + for(const auto& val : local_recs) { rocprofiler_counter_id_t id; rocprofiler_query_record_counter_id(val.id, &id); @@ -615,9 +652,10 @@ TEST_F(device_counting_service_test, sync_grbm_verify) TEST_F(device_counting_service_test, sync_gpu_util_verify) { test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GPU_UTIL"}, 50000); - ROCP_ERROR << global_recs().size(); + auto local_recs = global_recs().rlock([](const auto& data) { return data; }); + ROCP_ERROR << local_recs.size(); - for(const auto& val : global_recs()) + for(const auto& val : local_recs) { rocprofiler_counter_id_t id; rocprofiler_query_record_counter_id(val.id, &id); @@ -631,9 +669,10 @@ TEST_F(device_counting_service_test, sync_gpu_util_verify) TEST_F(device_counting_service_test, sync_sq_waves_verify) { test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"SQ_WAVES_sum"}, 50000); - ROCP_ERROR << global_recs().size(); + auto local_recs = global_recs().rlock([](const auto& data) { return data; }); + ROCP_ERROR << local_recs.size(); - for(const auto& val : global_recs()) + for(const auto& val : local_recs) { rocprofiler_counter_id_t id; rocprofiler_query_record_counter_id(val.id, &id); diff --git a/source/lib/rocprofiler-sdk/device_counting_service.cpp b/source/lib/rocprofiler-sdk/device_counting_service.cpp index 449a93fd..b1accd93 100644 --- a/source/lib/rocprofiler-sdk/device_counting_service.cpp +++ b/source/lib/rocprofiler-sdk/device_counting_service.cpp @@ -27,6 +27,8 @@ #include "lib/rocprofiler-sdk/counters/device_counting.hpp" #include "rocprofiler-sdk/fwd.h" +#include + extern "C" { rocprofiler_status_t rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id, @@ -40,11 +42,35 @@ 
rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i } rocprofiler_status_t -rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id, - rocprofiler_user_data_t user_data, - rocprofiler_counter_flag_t flags) +rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id, + rocprofiler_user_data_t user_data, + rocprofiler_counter_flag_t flags, + rocprofiler_record_counter_t* output_records, + size_t* rec_count) { + if(output_records != nullptr) + { + if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0) + return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; + CHECK(rec_count); + auto recs = std::vector{}; + auto status = rocprofiler::counters::read_agent_ctx( + rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs); + if(status == ROCPROFILER_STATUS_SUCCESS) + { + if(recs.size() > *rec_count) + { + *rec_count = recs.size(); + return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; + } + *rec_count = recs.size(); + std::memcpy( + output_records, recs.data(), sizeof(rocprofiler_record_counter_t) * recs.size()); + } + return status; + } + return rocprofiler::counters::read_agent_ctx( - rocprofiler::context::get_registered_context(context_id), user_data, flags); + rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr); } }