Skip to content

Commit

Permalink
SDK: counter collection serialization per device (#1157)
Browse files Browse the repository at this point in the history
Migrates profiler_serializer class in QueueController to have an instance per-agent instead of one globally. Other changes in this commit are to allow for maps of the queues associated with each agent to be passed to profiler_serializer when it is turned on/off. Existing test cases cover whether or not the kernels are serialized (multistream app). New test case added to show that this serialization only occurs on a per device level with a kernel launched on one device waiting for a value to be set on the other.
  • Loading branch information
bwelton authored Oct 25, 2024
1 parent 5e1643c commit 4a5b1d9
Show file tree
Hide file tree
Showing 13 changed files with 275 additions and 51 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
- Fix Support for derived counters in reduce operation and bug fix for max in reduce
- Check to force tools to initialize context id with zero.
- Fix to handle a range of values for select() dimension in expressions parser.
- PMC dispatch based Counter Collection Serialization is now per-device instead of global across all devices.

### Removed

Expand Down
22 changes: 22 additions & 0 deletions samples/counter_collection/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,28 @@ set_tests_properties(
"${counter-collection-buffer-env}" FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")

set_source_files_properties(per_dev_serialization.cpp PROPERTIES LANGUAGE HIP)
add_executable(counter-collection-buffer-device-serialization)
target_sources(counter-collection-buffer-device-serialization
PRIVATE per_dev_serialization.cpp)
target_link_libraries(counter-collection-buffer-device-serialization
PRIVATE counter-collection-buffer-client Threads::Threads)

rocprofiler_samples_get_ld_library_path_env(LIBRARY_PATH_ENV)
rocprofiler_samples_get_preload_env(PRELOAD_ENV counter-collection-buffer-client)

set(counter-collection-buffer-device-serialization-env "${PRELOAD_ENV}"
"${LIBRARY_PATH_ENV}")

add_test(NAME counter-collection-buffer-device-serialization
COMMAND $<TARGET_FILE:counter-collection-buffer-device-serialization>)

set_tests_properties(
counter-collection-buffer-device-serialization
PROPERTIES TIMEOUT 120 LABELS "samples" ENVIRONMENT
"${counter-collection-buffer-device-serialization-env}"
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")

add_library(counter-collection-callback-client SHARED)
target_sources(counter-collection-callback-client PRIVATE callback_client.cpp client.hpp)
target_link_libraries(
Expand Down
32 changes: 31 additions & 1 deletion samples/counter_collection/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include "client.hpp"

#include <cstdint>
#include <fstream>
#include <functional>
#include <iostream>
Expand Down Expand Up @@ -74,13 +75,34 @@ get_buffer()
return buf;
}

std::unordered_map<uint64_t, std::vector<rocprofiler_record_dimension_info_t>>**
dimension_cache()
{
static std::unordered_map<uint64_t, std::vector<rocprofiler_record_dimension_info_t>>* cache;
return &cache;
}

/**
* For a given counter, query the dimensions that it has. Typically you will
* want to call this function once to get the dimensions and cache them.
*/
std::vector<rocprofiler_record_dimension_info_t>
counter_dimensions(rocprofiler_counter_id_t counter)
{
if(*dimension_cache() == nullptr) return {};

if((*dimension_cache())->count(counter.handle) > 0)
{
return (*dimension_cache())->at(counter.handle);
}

return {};
}

void
fill_dimension_cache(rocprofiler_counter_id_t counter)
{
assert(*dimension_cache() != nullptr);
std::vector<rocprofiler_record_dimension_info_t> dims;
rocprofiler_available_dimensions_cb_t cb =
[](rocprofiler_counter_id_t,
Expand All @@ -97,7 +119,7 @@ counter_dimensions(rocprofiler_counter_id_t counter)
};
ROCPROFILER_CALL(rocprofiler_iterate_counter_dimensions(counter, cb, &dims),
"Could not iterate counter dimensions");
return dims;
(*dimension_cache())->emplace(counter.handle, dims);
}

/**
Expand Down Expand Up @@ -251,6 +273,7 @@ build_profile_for_agent(rocprofiler_agent_id_t agent,
{
std::clog << "Counter: " << counter.handle << " " << version.name << "\n";
collect_counters.push_back(counter);
fill_dimension_cache(counter);
}
}

Expand Down Expand Up @@ -375,6 +398,10 @@ tool_fini(void* user_data)
auto* output_stream = static_cast<std::ostream*>(user_data);
*output_stream << std::flush;
if(output_stream != &std::cout && output_stream != &std::cerr) delete output_stream;

auto* tmp_ptr = *dimension_cache();
*dimension_cache() = nullptr;
delete tmp_ptr;
}
} // namespace

Expand Down Expand Up @@ -416,6 +443,9 @@ rocprofiler_configure(uint32_t version,
&tool_fini,
static_cast<void*>(output_stream)};

*dimension_cache() =
new std::unordered_map<uint64_t, std::vector<rocprofiler_record_dimension_info_t>>();

// return pointer to configure data
return &cfg;
}
71 changes: 71 additions & 0 deletions samples/counter_collection/per_dev_serialization.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include <hip/hip_runtime.h>

#include "client.hpp"

#define HIP_CALL(call) \
do \
{ \
hipError_t err = call; \
if(err != hipSuccess) \
{ \
fprintf(stderr, "%s\n", hipGetErrorString(err)); \
abort(); \
} \
} while(0)

__global__ void
kernelA(int* wait_on, int value, int* no_opt)
{
while(*wait_on != value)
{
(*no_opt)++;
};
(*wait_on)--;
}

int
main(int, char**)
{
int ntotdevice = 0;
HIP_CALL(hipGetDeviceCount(&ntotdevice));
if(ntotdevice < 2) return 0;

start();
int* check_value = nullptr;
int* no_opt = nullptr;
HIP_CALL(hipMallocManaged(&check_value, sizeof(*check_value)));
HIP_CALL(hipMallocManaged(&no_opt, sizeof(*no_opt)));
*no_opt = 0;
*check_value = 1;
// Will hang if per-device serialization is not functional
HIP_CALL(hipSetDevice(0));
hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, check_value, 0, no_opt);
HIP_CALL(hipSetDevice(1));
hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, check_value, 1, no_opt);
HIP_CALL(hipSetDevice(0));
HIP_CALL(hipDeviceSynchronize());

std::cerr << "Run complete\n";
}
20 changes: 11 additions & 9 deletions source/lib/rocprofiler-sdk/counters/dispatch_handlers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,14 @@ queue_cb(const context::context* ctx,
// and maybe adds barrier packets if the state is transitioning from serialized <->
// unserialized
auto maybe_add_serialization = [&](auto& gen_pkt) {
CHECK_NOTNULL(hsa::get_queue_controller())->serializer().rlock([&](const auto& serializer) {
for(auto& s_pkt : serializer.kernel_dispatch(queue))
{
gen_pkt->before_krn_pkt.push_back(s_pkt.ext_amd_aql_pm4);
}
});
CHECK_NOTNULL(hsa::get_queue_controller())
->serializer(&queue)
.rlock([&](const auto& serializer) {
for(auto& s_pkt : serializer.kernel_dispatch(queue))
{
gen_pkt->before_krn_pkt.push_back(s_pkt.ext_amd_aql_pm4);
}
});
};

// Packet generated when no instrumentation is performed. May contain serialization
Expand Down Expand Up @@ -189,9 +191,9 @@ completed_cb(const context::context* ctx,

if(!pkt) return;

CHECK_NOTNULL(hsa::get_queue_controller())->serializer().wlock([&](auto& serializer) {
serializer.kernel_completion_signal(session.queue);
});
CHECK_NOTNULL(hsa::get_queue_controller())
->serializer(&session.queue)
.wlock([&](auto& serializer) { serializer.kernel_completion_signal(session.queue); });

// We have no profile config, nothing to output.
if(!prof_config) return;
Expand Down
4 changes: 2 additions & 2 deletions source/lib/rocprofiler-sdk/hsa/hsa_barrier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,13 @@ hsa_barrier::~hsa_barrier()
}

void
hsa_barrier::set_barrier(const queue_map_t& q)
hsa_barrier::set_barrier(const queue_map_ptr_t& q)
{
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 1);
_queue_waiting.wlock([&](auto& queue_waiting) {
for(const auto& [_, queue] : q)
{
queue->lock_queue([ptr = queue.get(), &queue_waiting]() {
queue->lock_queue([ptr = queue, &queue_waiting]() {
if(ptr->active_async_packets() > 0)
{
queue_waiting[ptr->get_id().handle] = ptr->active_async_packets();
Expand Down
5 changes: 2 additions & 3 deletions source/lib/rocprofiler-sdk/hsa/hsa_barrier.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,11 @@ class Queue;
class hsa_barrier
{
public:
using queue_map_t = std::unordered_map<hsa_queue_t*, std::unique_ptr<Queue>>;

using queue_map_ptr_t = std::unordered_map<hsa_queue_t*, Queue*>;
hsa_barrier(std::function<void()>&& finished, CoreApiTable core_api);
~hsa_barrier();

void set_barrier(const queue_map_t& q);
void set_barrier(const queue_map_ptr_t& q);

std::optional<rocprofiler_packet> enqueue_packet(const Queue* queue);
bool register_completion(const Queue* queue);
Expand Down
10 changes: 5 additions & 5 deletions source/lib/rocprofiler-sdk/hsa/profile_serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ profiler_serializer_ready_signal_handler(hsa_signal_value_t /* signal_value */,
auto* hsa_queue = static_cast<hsa_queue_t*>(data);
const auto* queue = CHECK_NOTNULL(get_queue_controller())->get_queue(*hsa_queue);
CHECK(queue);
CHECK_NOTNULL(get_queue_controller())->serializer().wlock([&](auto& serializer) {
CHECK_NOTNULL(get_queue_controller())->serializer(queue).wlock([&](auto& serializer) {
serializer.queue_ready(hsa_queue, *queue);
});
return true;
Expand Down Expand Up @@ -242,7 +242,7 @@ profiler_serializer::destroy_queue(hsa_queue_t* id, const Queue& queue)

// Enable the serializer
void
profiler_serializer::enable(const queue_map_t& queues)
profiler_serializer::enable(const hsa_barrier::queue_map_ptr_t& queues)
{
if(_serializer_status == Status::ENABLED) return;

Expand All @@ -257,14 +257,14 @@ profiler_serializer::enable(const queue_map_t& queues)
std::make_unique<hsa_barrier>(
[] {}, CHECK_NOTNULL(get_queue_controller())->get_core_table()));
_serializer_status = Status::ENABLED;
_barrier.back().barrier->set_barrier(queues);

_barrier.back().barrier->set_barrier(queues);
ROCP_INFO << "Profiler serialization enabled";
}

// Disable the serializer
void
profiler_serializer::disable(const queue_map_t& queues)
profiler_serializer::disable(const hsa_barrier::queue_map_ptr_t& queues)
{
if(_serializer_status == Status::DISABLED) return;

Expand All @@ -279,8 +279,8 @@ profiler_serializer::disable(const queue_map_t& queues)
std::make_unique<hsa_barrier>(
[] {}, CHECK_NOTNULL(get_queue_controller())->get_core_table()));
_serializer_status = Status::DISABLED;
_barrier.back().barrier->set_barrier(queues);

_barrier.back().barrier->set_barrier(queues);
ROCP_INFO << "Profiler serialization disabled";
}

Expand Down
5 changes: 2 additions & 3 deletions source/lib/rocprofiler-sdk/hsa/profile_serializer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,16 @@ class profiler_serializer
std::unique_ptr<hsa_barrier> barrier;
};

using queue_map_t = std::unordered_map<hsa_queue_t*, std::unique_ptr<Queue>>;
void kernel_completion_signal(const Queue&);
// Signal a kernel dispatch is taking place, generates packets needed to be
// inserted to support kernel dispatch
common::container::small_vector<hsa::rocprofiler_packet, 3> kernel_dispatch(const Queue&) const;

void queue_ready(hsa_queue_t* hsa_queue, const Queue& queue);
// Enable the serializer
void enable(const queue_map_t& queues);
void enable(const hsa_barrier::queue_map_ptr_t& queues);
// Disable the serializer
void disable(const queue_map_t& queues);
void disable(const hsa_barrier::queue_map_ptr_t& queues);

void destroy_queue(hsa_queue_t* id, const Queue& queue);

Expand Down
Loading

0 comments on commit 4a5b1d9

Please sign in to comment.