From 16d535ef48f0d6ecf83d5886a94019de7f2ed29a Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Tue, 30 Jul 2024 19:57:19 -0500 Subject: [PATCH 1/7] rocprofv3 OTF2 Output Support (#995) * CMake support for OTF2 library * Preliminary OTF2 generation implementation * Completed OTF2 Support - HSA API - HIP API - Marker API - Async Memory Copies - Kernel Dispatch * Update lib/rocprofiler-sdk-tool/generateOTF2.cpp - fix location type for dispatches * Testing for OTF2 output * Add OTF2 to requirements.txt * Update lib/rocprofiler-sdk-tool/generateOTF2.cpp - fix getting kernel name * OTF2 testing with rocprofv3/tracing-hip-in-libraries * Format external/otf2/CMakeLists.txt * Update external/otf2/CMakeLists.txt - guard CMP0135 for cmake < 3.24 * Update lib/rocprofiler-sdk-tool/generateOTF2.cpp - fix duplicate string ref issue * Update lib/rocprofiler-sdk-tool/generateOTF2.cpp - fix header includes * Update CI workflow - sudo install pypi requirements for core-rpm for $HOME/.local installs * Update pytest_utils/otf2_reader.py - modifications for reading trace * Update pytest_utils/otf2_reader.py - misc cleanup * Update CI workflow - fix installer artifact naming * Update pytest_utils/otf2_reader.py - handle slightly overlapping kernel timestamps for MI300 * OTF2 attributes for category * Testing with OTF2Reader category attributes * Fix memory leak in OTF2 generation - leaking OTF2_AttributeList --- .github/workflows/continuous_integration.yml | 11 +- cmake/rocprofiler_config_interfaces.cmake | 8 + cmake/rocprofiler_interfaces.cmake | 1 + external/CMakeLists.txt | 3 + external/otf2/CMakeLists.txt | 72 ++ requirements.txt | 1 + source/bin/rocprofv3.py | 4 +- .../include/rocprofiler-sdk/cxx/perfetto.hpp | 3 +- source/lib/common/utility.hpp | 4 +- .../lib/rocprofiler-sdk-tool/CMakeLists.txt | 5 +- source/lib/rocprofiler-sdk-tool/config.cpp | 64 +- source/lib/rocprofiler-sdk-tool/config.hpp | 1 + .../lib/rocprofiler-sdk-tool/generateOTF2.cpp | 796 ++++++++++++++++++ .../lib/rocprofiler-sdk-tool/generateOTF2.hpp | 44 + .../lib/rocprofiler-sdk-tool/output_file.cpp | 29 +- .../lib/rocprofiler-sdk-tool/output_file.hpp | 3 + source/lib/rocprofiler-sdk-tool/tool.cpp | 19 +- tests/pytest-packages/CMakeLists.txt | 2 +- .../pytest_utils/otf2_reader.py | 201 +++++ tests/pytest-packages/tests/rocprofv3.py | 27 + tests/rocprofv3/kernel-rename/CMakeLists.txt | 10 +- tests/rocprofv3/kernel-rename/conftest.py | 15 + .../kernel-rename/input-kernel-rename.yml | 2 +- tests/rocprofv3/kernel-rename/validate.py | 8 + .../tracing-hip-in-libraries/CMakeLists.txt | 102 +-- .../tracing-hip-in-libraries/conftest.py | 13 +- .../tracing-hip-in-libraries/validate.py | 7 + 27 files changed, 1318 insertions(+), 137 deletions(-) create mode 100644 external/otf2/CMakeLists.txt create mode 100644 source/lib/rocprofiler-sdk-tool/generateOTF2.cpp create mode 100644 source/lib/rocprofiler-sdk-tool/generateOTF2.hpp create mode 100644 tests/pytest-packages/pytest_utils/otf2_reader.py diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 57f0b034..e202beb5 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -77,7 +77,7 @@ jobs: shell: bash run: | echo 'ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1' >> $GITHUB_ENV - + - name: Configure, Build, and Test timeout-minutes: 30 shell: bash @@ -150,12 +150,12 @@ jobs: if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} uses: actions/upload-artifact@v4 with: - name: installers + name: installers-deb path: | ${{github.workspace}}/build/*.deb ${{github.workspace}}/build/*.rpm ${{github.workspace}}/build/*.tgz - + core-rpm: # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix strategy: @@ -181,6 +181,7 @@ jobs: shell: bash run: | python3 -m pip install -r requirements.txt + sudo python3 -m pip install -r requirements.txt - name: List Files shell: bash @@ -190,7 +191,7 @@ jobs: which-realpath() { echo -e "\n$1 resolves to $(realpath $(which $1))"; echo "$($(which $1) --version &> /dev/stdout | head -n 1)"; } for i in python3 git cmake ctest gcc g++ gcov; do which-realpath $i; done ls -la - + - name: Exclude PC Sampling Tests if: ${{ !contains(matrix.runner, 'mi200') && !contains(matrix.runner, 'mi300') }} shell: bash @@ -275,7 +276,7 @@ jobs: if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} uses: actions/upload-artifact@v4 with: - name: installers + name: installers-rpm path: | ${{github.workspace}}/build/*.deb ${{github.workspace}}/build/*.rpm diff --git a/cmake/rocprofiler_config_interfaces.cmake b/cmake/rocprofiler_config_interfaces.cmake index e25f4730..311e4d55 100644 --- a/cmake/rocprofiler_config_interfaces.cmake +++ b/cmake/rocprofiler_config_interfaces.cmake @@ -276,3 +276,11 @@ target_link_libraries(rocprofiler-drm INTERFACE ${drm_LIBRARY} ${drm_amdgpu_LIBR # get_target_property(ELFIO_INCLUDE_DIR elfio::elfio INTERFACE_INCLUDE_DIRECTORIES) # target_include_directories(rocprofiler-elfio SYSTEM INTERFACE ${ELFIO_INCLUDE_DIR}) target_link_libraries(rocprofiler-elfio INTERFACE elfio::elfio) + +# ----------------------------------------------------------------------------------------# +# +# OTF2 +# +# ----------------------------------------------------------------------------------------# + +target_link_libraries(rocprofiler-otf2 INTERFACE otf2::otf2) diff --git a/cmake/rocprofiler_interfaces.cmake b/cmake/rocprofiler_interfaces.cmake index 77173783..01ca370c 100644 --- a/cmake/rocprofiler_interfaces.cmake +++ b/cmake/rocprofiler_interfaces.cmake @@ -18,6 +18,7 @@ rocprofiler_add_interface_library(rocprofiler-threading "Enables multithreading INTERNAL) rocprofiler_add_interface_library(rocprofiler-perfetto "Enables Perfetto support" INTERNAL) +rocprofiler_add_interface_library(rocprofiler-otf2 "Enables OTF2 support" INTERNAL) rocprofiler_add_interface_library(rocprofiler-cereal "Enables Cereal support" INTERNAL) rocprofiler_add_interface_library(rocprofiler-compile-definitions "Compile definitions" INTERNAL) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index 149aa546..075d0b30 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -204,3 +204,6 @@ if(TARGET rocprofiler-elfio) get_target_property(ELFIO_INCLUDE_DIR elfio::elfio INTERFACE_INCLUDE_DIRECTORIES) target_include_directories(rocprofiler-elfio SYSTEM INTERFACE ${ELFIO_INCLUDE_DIR}) endif() + +# OTF2 +add_subdirectory(otf2) diff --git a/external/otf2/CMakeLists.txt b/external/otf2/CMakeLists.txt new file mode 100644 index 00000000..252449b8 --- /dev/null +++ b/external/otf2/CMakeLists.txt @@ -0,0 +1,72 @@ +# ====================================================================================== +# Builds OTF2 +# ====================================================================================== + +set(ROCPROFILER_BINARY_DIR ${PROJECT_BINARY_DIR}) +set(OTF2_VERSION + "3.0.3" + CACHE STRING "OTF2 version") +set(OTF2_URL_HASH + "SHA256=18a3905f7917340387e3edc8e5766f31ab1af41f4ecc5665da6c769ca21c4ee8" + CACHE STRING "OTF2 URL download hash") + +project( + OTF2 + LANGUAGES C + VERSION ${OTF2_VERSION} + DESCRIPTION "Open Trace Format v2" + HOMEPAGE_URL "https://perftools.pages.jsc.fz-juelich.de/cicd/otf2") + +include(FetchContent) +include(ExternalProject) +include(ProcessorCount) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24) + cmake_policy(SET CMP0135 NEW) +endif() + +set(FETCHCONTENT_BASE_DIR ${ROCPROFILER_BINARY_DIR}/external/packages) + +fetchcontent_declare( + otf2-source + URL https://perftools.pages.jsc.fz-juelich.de/cicd/otf2/tags/otf2-${OTF2_VERSION}/otf2-${OTF2_VERSION}.tar.gz + URL_HASH ${OTF2_URL_HASH}) + +fetchcontent_getproperties(ot2f-source) + +if(NOT ot2f-source_POPULATED) + message(STATUS "Downloading OTF2...") + fetchcontent_populate(otf2-source) +endif() + +set(_otf2_root ${ROCPROFILER_BINARY_DIR}/external/otf2) +set(_otf2_inc_dirs $) +set(_otf2_lib_dirs $) +set(_otf2_libs $) +set(_otf2_build_byproducts "${_otf2_root}/lib/libotf2${CMAKE_STATIC_LIBRARY_SUFFIX}") + +find_program( + MAKE_COMMAND + NAMES make gmake + PATH_SUFFIXES bin REQUIRED) + +externalproject_add( + otf2-build + PREFIX ${_otf2_root} + SOURCE_DIR ${otf2-source_SOURCE_DIR} + BUILD_IN_SOURCE 1 + DOWNLOAD_COMMAND "" + PATCH_COMMAND + ${CMAKE_COMMAND} -E env CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} + /configure -q --prefix=${_otf2_root} CFLAGS=-fPIC\ -O3\ -g + CXXFLAGS=-fPIC\ -O3\ -g PYTHON=: SPHINX=: + CONFIGURE_COMMAND ${MAKE_COMMAND} install -s + BUILD_COMMAND "" + BUILD_BYPRODUCTS "${_otf2_build_byproducts}" + INSTALL_COMMAND "") + +add_library(otf2 INTERFACE) +add_library(otf2::otf2 ALIAS otf2) +target_include_directories(otf2 SYSTEM INTERFACE ${_otf2_inc_dirs}) +target_link_directories(otf2 INTERFACE ${_otf2_lib_dirs}) +target_link_libraries(otf2 INTERFACE ${_otf2_libs}) diff --git a/requirements.txt b/requirements.txt index 4195d9d0..9ea6617f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ cmake>=3.21.0 cmake-format dataclasses numpy +otf2 pandas perfetto pycobertura diff --git a/source/bin/rocprofv3.py b/source/bin/rocprofv3.py index a5d47a36..fadaa32b 100755 --- a/source/bin/rocprofv3.py +++ b/source/bin/rocprofv3.py @@ -196,10 +196,10 @@ def add_parser_bool_argument(*args, **kwargs): ) parser.add_argument( "--output-format", - help="For adding output format (supported formats: csv, json, pftrace)", + help="For adding output format (supported formats: csv, json, pftrace, otf2)", nargs="+", default=None, - choices=("csv", "json", "pftrace"), + choices=("csv", "json", "pftrace", "otf2"), type=str.lower, ) parser.add_argument( diff --git a/source/include/rocprofiler-sdk/cxx/perfetto.hpp b/source/include/rocprofiler-sdk/cxx/perfetto.hpp index dabc494e..6ceb5165 100644 --- a/source/include/rocprofiler-sdk/cxx/perfetto.hpp +++ b/source/include/rocprofiler-sdk/cxx/perfetto.hpp @@ -53,7 +53,8 @@ { \ namespace NS \ { \ - struct VALUE; \ + struct VALUE \ + {}; \ } \ } \ } \ diff --git a/source/lib/common/utility.hpp b/source/lib/common/utility.hpp index b2e0eabf..255606ef 100644 --- a/source/lib/common/utility.hpp +++ b/source/lib/common/utility.hpp @@ -79,9 +79,11 @@ get_ticks(clockid_t clk_id_v) noexcept return (static_cast(ts.tv_sec) * nanosec) + static_cast(ts.tv_nsec); } +static constexpr int default_clock_id = CLOCK_BOOTTIME; + // CLOCK_MONOTONIC_RAW equates to HSA-runtime library implementation of os::ReadAccurateClock() // CLOCK_BOOTTIME equates to HSA-runtime library implementation of os::ReadSystemClock() -template +template inline uint64_t timestamp_ns() { diff --git a/source/lib/rocprofiler-sdk-tool/CMakeLists.txt b/source/lib/rocprofiler-sdk-tool/CMakeLists.txt index ccfe1dae..22a361dd 100644 --- a/source/lib/rocprofiler-sdk-tool/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk-tool/CMakeLists.txt @@ -11,6 +11,7 @@ set(TOOL_HEADERS domain_type.hpp generateCSV.hpp generateJSON.hpp + generateOTF2.hpp generatePerfetto.hpp helper.hpp output_file.hpp @@ -23,6 +24,7 @@ set(TOOL_SOURCES domain_type.cpp generateCSV.cpp generateJSON.cpp + generateOTF2.cpp generatePerfetto.cpp helper.cpp main.c @@ -44,7 +46,8 @@ target_link_libraries( rocprofiler-sdk::rocprofiler-memcheck rocprofiler-sdk::rocprofiler-common-library rocprofiler-sdk::rocprofiler-cereal - rocprofiler-sdk::rocprofiler-perfetto) + rocprofiler-sdk::rocprofiler-perfetto + rocprofiler-sdk::rocprofiler-otf2) set_target_properties( rocprofiler-sdk-tool diff --git a/source/lib/rocprofiler-sdk-tool/config.cpp b/source/lib/rocprofiler-sdk-tool/config.cpp index 68850653..65f8fdec 100644 --- a/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/source/lib/rocprofiler-sdk-tool/config.cpp @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -51,24 +52,35 @@ namespace tool { namespace { +template +auto +as_pointer(Tp&& _val) +{ + return new Tp{_val}; +} + std::string* -get_local_datetime(const std::string& dt_format); +get_local_datetime(const std::string& dt_format, std::time_t*& dt_curr); -const auto* launch_datetime = get_local_datetime(get_env("ROCP_TIME_FORMAT", "%F_%H.%M")); -const auto env_regexes = +std::time_t* launch_time = nullptr; +const auto* launch_clock = as_pointer(std::chrono::system_clock::now()); +const auto* launch_datetime = + get_local_datetime(get_env("ROCP_TIME_FORMAT", "%F_%H.%M"), launch_time); +const auto env_regexes = new std::array{std::regex{"(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"}, std::regex{"(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)"}}; std::string* -get_local_datetime(const std::string& dt_format) +get_local_datetime(const std::string& dt_format, std::time_t*& _dt_curr) { constexpr auto strsize = 512; - auto dt_curr = std::time_t{std::time(nullptr)}; - char mbstr[strsize]; + if(!_dt_curr) _dt_curr = new std::time_t{std::time_t{std::time(nullptr)}}; + + char mbstr[strsize] = {}; memset(mbstr, '\0', sizeof(mbstr) * sizeof(char)); - if(std::strftime(mbstr, sizeof(mbstr) - 1, dt_format.c_str(), std::localtime(&dt_curr)) != 0) + if(std::strftime(mbstr, sizeof(mbstr) - 1, dt_format.c_str(), std::localtime(_dt_curr)) != 0) return new std::string{mbstr}; return nullptr; @@ -245,39 +257,23 @@ config::config() get_env("ROCPROF_KERNEL_FILTER_RANGE", std::string{}))} , counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))} { - auto output_format = get_env("ROCPROF_OUTPUT_FORMAT", "CSV"); - - for(auto& itr : output_format) - itr = toupper(itr); - - for(auto itr : {',', ';', ':'}) - { - auto pos = std::string::npos; - do - { - pos = output_format.find(itr); - if(pos != std::string::npos) output_format.replace(pos, 1, " "); - } while(pos != std::string::npos); - } - - auto entries = std::set{}; - auto parser = std::stringstream{output_format}; + auto to_upper = [](std::string val) { + for(auto& vitr : val) + vitr = toupper(vitr); + return val; + }; - while(true) - { - auto _val = std::string{}; - parser >> _val; - if(!_val.empty()) - entries.emplace(_val); - else - break; - } + auto output_format = get_env("ROCPROF_OUTPUT_FORMAT", "CSV"); + auto entries = std::set{}; + for(const auto& itr : sdk::parse::tokenize(output_format, " \t,;:")) + entries.emplace(to_upper(itr)); csv_output = entries.count("CSV") > 0 || entries.empty(); json_output = entries.count("JSON") > 0; pftrace_output = entries.count("PFTRACE") > 0; + otf2_output = entries.count("OTF2") > 0; - const auto supported_formats = std::set{"CSV", "JSON", "PFTRACE"}; + const auto supported_formats = std::set{"CSV", "JSON", "PFTRACE", "OTF2"}; for(const auto& itr : entries) { LOG_IF(FATAL, supported_formats.count(itr) == 0) diff --git a/source/lib/rocprofiler-sdk-tool/config.hpp b/source/lib/rocprofiler-sdk-tool/config.hpp index 58a482c0..8c430e98 100644 --- a/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/source/lib/rocprofiler-sdk-tool/config.hpp @@ -77,6 +77,7 @@ struct config bool csv_output = false; bool json_output = false; bool pftrace_output = false; + bool otf2_output = false; bool kernel_rename = get_env("ROCPROF_KERNEL_RENAME", false); int mpi_size = get_mpi_size(); int mpi_rank = get_mpi_rank(); diff --git a/source/lib/rocprofiler-sdk-tool/generateOTF2.cpp b/source/lib/rocprofiler-sdk-tool/generateOTF2.cpp new file mode 100644 index 00000000..56255b04 --- /dev/null +++ b/source/lib/rocprofiler-sdk-tool/generateOTF2.cpp @@ -0,0 +1,796 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "generateOTF2.hpp" +#include "helper.hpp" +#include "lib/common/filesystem.hpp" +#include "lib/common/mpl.hpp" +#include "lib/common/units.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk-tool/config.hpp" +#include "output_file.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define OTF2_CHECK(result) \ + { \ + OTF2_ErrorCode ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__) = result; \ + if(ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__) != OTF2_SUCCESS) \ + { \ + auto _err_name = OTF2_Error_GetName(ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__)); \ + auto _err_msg = \ + OTF2_Error_GetDescription(ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__)); \ + ROCP_FATAL << #result << " failed with error code " << _err_name \ + << " (code=" << ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__) \ + << ") :: " << _err_msg; \ + } \ + } + +namespace rocprofiler +{ +namespace tool +{ +namespace +{ +template +struct array_hash +{ + size_t operator()(const std::array& _data) const + { + constexpr size_t seed = 0x9e3779b9; + size_t _val = 0; + for(const auto& itr : _data) + _val ^= std::hash{}(itr) + seed + (_val << 6) + (_val >> 2); + return _val; + } + + template + size_t operator()(Up... _data) const + { + static_assert(sizeof...(Up) == N, "Insufficient data"); + return operator()(std::array{std::forward(_data)...}); + } +}; + +struct region_info +{ + std::string name = {}; + OTF2_RegionRole_enum region_role = OTF2_REGION_ROLE_FUNCTION; + OTF2_Paradigm_enum paradigm = OTF2_PARADIGM_HIP; +}; + +OTF2_FlushType +pre_flush(void* userData, + OTF2_FileType fileType, + OTF2_LocationRef location, + void* callerData, + bool fini); + +OTF2_TimeStamp +post_flush(void* userData, OTF2_FileType fileType, OTF2_LocationRef location); + +template +void +consume_variables(Args&&...) +{} + +using event_writer_t = OTF2_EvtWriter; +using archive_t = OTF2_Archive; +using attribute_list_t = OTF2_AttributeList; +using hash_value_t = size_t; +using hash_map_t = std::unordered_map; + +auto main_tid = common::get_tid(); +archive_t* archive = nullptr; +auto flush_callbacks = OTF2_FlushCallbacks{pre_flush, post_flush}; + +struct location_base +{ + uint64_t pid = 0; + rocprofiler_thread_id_t tid = 0; + rocprofiler_agent_id_t agent = {.handle = 0}; + rocprofiler_queue_id_t queue = {.handle = 0}; + + location_base(uint64_t _pid, + rocprofiler_thread_id_t _tid, + rocprofiler_agent_id_t _agent = {.handle = 0}, + rocprofiler_queue_id_t _queue = {.handle = 0}) + : pid{_pid} + , tid{_tid} + , agent{_agent} + , queue{_queue} + {} + + auto hash() const + { + return array_hash{}(pid, tid, agent.handle + 1, queue.handle + 1); + } +}; + +bool +operator<(const location_base& lhs, const location_base& rhs) +{ + return std::tie(lhs.pid, lhs.tid, lhs.agent.handle, lhs.queue.handle) < + std::tie(rhs.pid, rhs.tid, rhs.agent.handle, rhs.queue.handle); +} + +struct location_data : location_base +{ + location_data(uint64_t _pid, + rocprofiler_thread_id_t _tid, + rocprofiler_agent_id_t _agent = {.handle = 0}, + rocprofiler_queue_id_t _queue = {.handle = 0}) + : location_base{_pid, _tid, _agent, _queue} + , index{++index_counter} + , event_writer{OTF2_Archive_GetEvtWriter(CHECK_NOTNULL(archive), index)} + { + CHECK_NOTNULL(event_writer); + } + + using location_base::hash; + + static uint64_t index_counter; + + uint64_t index = 0; + event_writer_t* event_writer = nullptr; + + bool operator==(const location_base& rhs) const { return (hash() == rhs.hash()); } +}; + +uint64_t location_data::index_counter = 0; + +OTF2_TimeStamp +get_time() +{ + auto _ts = rocprofiler_timestamp_t{}; + rocprofiler_get_timestamp(&_ts); + return static_cast(_ts); +} + +auto& +get_locations() +{ + static auto _v = std::vector>{}; + return _v; +} + +const location_data* +get_location(const location_base& _location, bool _init = false) +{ + for(auto& itr : get_locations()) + if(*itr == _location) return itr.get(); + + if(_init) + return get_locations() + .emplace_back(std::make_unique( + _location.pid, _location.tid, _location.agent, _location.queue)) + .get(); + + return nullptr; +} + +event_writer_t* +get_event_writer(const location_base& _location, bool _init = false) +{ + const auto* _loc = get_location(_location, _init); + return (_loc) ? _loc->event_writer : nullptr; +} + +OTF2_FlushType +pre_flush(void* userData, + OTF2_FileType fileType, + OTF2_LocationRef location, + void* callerData, + bool fini) +{ + consume_variables(userData, fileType, location, callerData, fini); + return OTF2_FLUSH; +} + +OTF2_TimeStamp +post_flush(void* userData, OTF2_FileType fileType, OTF2_LocationRef location) +{ + consume_variables(userData, fileType, location); + return get_time(); +} + +template +size_t +get_hash_id(Tp&& _val) +{ + using value_type = common::mpl::unqualified_type_t; + + if constexpr(!std::is_pointer::value) + return std::hash{}(std::forward(_val)); + else if constexpr(std::is_same::value || + std::is_same::value) + return get_hash_id(std::string_view{_val}); + else + return get_hash_id(*_val); +} + +template +auto +add_event(std::string_view name, + const location_base& _location, + rocprofiler_callback_phase_t _phase, + OTF2_TimeStamp _ts, + attribute_list_t* _attributes = nullptr) +{ + auto* evt_writer = get_event_writer(_location, true); + auto _hash = get_hash_id(name); + + if(_phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + OTF2_CHECK(OTF2_EvtWriter_Enter(evt_writer, _attributes, _ts, _hash)) + else if(_phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + OTF2_CHECK(OTF2_EvtWriter_Leave(evt_writer, _attributes, _ts, _hash)) + else + ROCP_FATAL << "otf2::add_event phase is not enter or exit"; +} + +void +setup() +{ + namespace fs = common::filesystem; + + auto _filename = get_output_filename("results", std::string_view{}); + auto _filepath = fs::path{_filename}; + auto _name = _filepath.filename().string(); + auto _path = _filepath.parent_path().string(); + + if(fs::exists(_filepath)) fs::remove_all(_filepath); + + constexpr uint64_t evt_chunk_size = 2 * common::units::MB; + constexpr uint64_t def_chunk_size = 8 * common::units::MB; + + archive = OTF2_Archive_Open(_path.c_str(), + _name.c_str(), + OTF2_FILEMODE_WRITE, + evt_chunk_size, // event chunk size + def_chunk_size, // def chunk size + OTF2_SUBSTRATE_POSIX, + OTF2_COMPRESSION_NONE); + + OTF2_CHECK(OTF2_Archive_SetFlushCallbacks(archive, &flush_callbacks, nullptr)); + OTF2_CHECK(OTF2_Archive_SetSerialCollectiveCallbacks(archive)); + OTF2_CHECK(OTF2_Pthread_Archive_SetLockingCallbacks(archive, nullptr)); + OTF2_CHECK(OTF2_Archive_OpenEvtFiles(archive)); + + ROCP_ERROR << "Opened result file: " << _filename << ".oft2"; +} + +void +shutdown() +{ + OTF2_CHECK(OTF2_Archive_Close(archive)); +} + +struct event_info +{ + explicit event_info(location_base&& _loc) + : m_location{tool::get_location(std::forward(_loc), true)} + {} + + auto id() const { return m_location->index; } + auto hash() const { return m_location->hash(); } + const location_base* get_location() const { return m_location; } + + std::string name = {}; + uint64_t event_count = 0; + +private: + const location_data* m_location = nullptr; +}; + +template +attribute_list_t* +create_attribute_list() +{ + auto* _val = OTF2_AttributeList_New(); + + const auto* _name = sdk::perfetto_category::name; + auto _hash = get_hash_id(_name); + + auto _attr_value = OTF2_AttributeValue{}; + _attr_value.stringRef = _hash; + OTF2_AttributeList_AddAttribute(_val, 0, OTF2_TYPE_STRING, _attr_value); + + return _val; +} +} // namespace + +void +write_otf2(tool_table* tool_functions, + uint64_t pid, + const std::vector& agent_data, + std::deque* hip_api_data, + std::deque* hsa_api_data, + std::deque* kernel_dispatch_data, + std::deque* memory_copy_data, + std::deque* marker_api_data, + std::deque* /*scratch_memory_data*/) +{ + namespace sdk = ::rocprofiler::sdk; + + setup(); + + auto _app_ts = *tool_functions->tool_get_app_timestamps_fn(); + auto agents_map = std::unordered_map{}; + for(auto itr : agent_data) + agents_map.emplace(itr.id, itr); + + const auto kernel_sym_data = get_kernel_symbol_data(); + const auto buffer_names = sdk::get_buffer_tracing_names(); + auto tids = std::set{}; + auto agent_thread_ids = std::map>{}; + auto agent_queue_ids = + std::map>>{}; + + auto thread_event_info = std::map{}; + auto agent_memcpy_info = + std::map>{}; + auto agent_dispatch_info = + std::map>>{}; + + auto _get_agent = [&agent_data](rocprofiler_agent_id_t _id) -> const rocprofiler_agent_t* { + for(const auto& itr : agent_data) + if(_id == itr.id) return &itr; + return CHECK_NOTNULL(nullptr); + }; + + auto _get_kernel_sym_data = + [&kernel_sym_data]( + const rocprofiler_kernel_dispatch_info_t& _info) -> const kernel_symbol_data* { + for(const auto& kitr : kernel_sym_data) + if(kitr.kernel_id == _info.kernel_id) return &kitr; + return CHECK_NOTNULL(nullptr); + }; + + { + for(auto itr : *hsa_api_data) + tids.emplace(itr.thread_id); + for(auto itr : *hip_api_data) + tids.emplace(itr.thread_id); + for(auto itr : *marker_api_data) + tids.emplace(itr.thread_id); + + for(auto itr : *memory_copy_data) + agent_thread_ids[itr.thread_id].emplace(itr.dst_agent_id); + + for(auto itr : *kernel_dispatch_data) + agent_queue_ids[itr.thread_id][itr.dispatch_info.agent_id].emplace( + itr.dispatch_info.queue_id); + } + + { + for(auto itr : tids) + thread_event_info.emplace(itr, location_base{pid, itr}); + + for(const auto& [tid, itr] : agent_thread_ids) + for(auto agent : itr) + agent_memcpy_info[tid].emplace(agent, location_base{pid, tid, agent}); + + for(auto [tid, itr] : agent_queue_ids) + for(auto [agent, qitr] : itr) + for(auto queue : qitr) + agent_dispatch_info[tid][agent].emplace(queue, + location_base{pid, tid, agent, queue}); + } + + for(auto& [tid, evt] : thread_event_info) + { + evt.name = fmt::format("Thread {}", tid); + } + + for(auto& [tid, itr] : agent_memcpy_info) + { + for(auto& [agent, evt] : itr) + { + const auto* _agent = _get_agent(agent); + auto _type_name = std::string_view{"UNK"}; + if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _type_name = "CPU"; + else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _type_name = "GPU"; + + evt.name = fmt::format( + "Thread {}, Copy to {} {}", tid, _type_name, _agent->logical_node_type_id); + } + } + + auto _queue_ids = std::map{}; + for(auto& [tid, itr] : agent_dispatch_info) + for(auto& [agent, qitr] : itr) + for(auto& [queue, evt] : qitr) + _queue_ids.emplace(queue, 0); + + { + uint64_t _n = 0; + for(auto& qitr : _queue_ids) + qitr.second = _n++; + } + + for(auto& [tid, itr] : agent_dispatch_info) + { + for(auto& [agent, qitr] : itr) + { + for(auto& [queue, evt] : qitr) + { + const auto* _agent = _get_agent(agent); + auto _type_name = std::string_view{"UNK"}; + if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _type_name = "CPU"; + else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _type_name = "GPU"; + + evt.name = fmt::format("Thread {}, Compute on {} {}, Queue {}", + tid, + _type_name, + _agent->logical_node_type_id, + _queue_ids.at(queue)); + } + } + } + + auto _hash_data = hash_map_t{}; + + struct evt_data + { + rocprofiler_callback_phase_t phase = ROCPROFILER_CALLBACK_PHASE_NONE; + std::string_view name = {}; + const location_base* location = nullptr; + uint64_t timestamp = 0; + OTF2_AttributeList* attributes = nullptr; + }; + + auto _data = std::deque{}; + auto _attr_str = std::unordered_map{}; + auto get_attr = [&_attr_str](auto _category) { + using category_t = common::mpl::unqualified_type_t; + auto _name = sdk::perfetto_category::name; + _attr_str.emplace(get_hash_id(_name), _name); + return create_attribute_list(); + }; + + // trace events + { + auto callbk_name_info = sdk::get_callback_tracing_names(); + + auto add_event_data = [&buffer_names, + &_hash_data, + &_data, + &tool_functions, + &thread_event_info, + &get_attr](const auto* _inp, auto _attrib) { + if(!_inp) return; + for(auto itr : *_inp) + { + using value_type = common::mpl::unqualified_type_t; + auto name = buffer_names.at(itr.kind, itr.operation); + auto paradigm = OTF2_PARADIGM_HIP; + if constexpr(std::is_same::value) + { + paradigm = OTF2_PARADIGM_USER; + if(itr.kind == ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API && + itr.operation != ROCPROFILER_MARKER_CORE_API_ID_roctxGetThreadId) + name = tool_functions->tool_get_roctx_msg_fn(itr.correlation_id.internal); + } + + _hash_data.emplace( + get_hash_id(name), + region_info{std::string{name}, OTF2_REGION_ROLE_FUNCTION, paradigm}); + + auto& _evt_info = thread_event_info.at(itr.thread_id); + _evt_info.event_count += 1; + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER, + name, + _evt_info.get_location(), + itr.start_timestamp, + get_attr(_attrib)}); + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT, + name, + _evt_info.get_location(), + itr.end_timestamp, + nullptr}); + } + }; + + add_event_data(hsa_api_data, sdk::category::hsa_api{}); + add_event_data(hip_api_data, sdk::category::hip_api{}); + add_event_data(marker_api_data, sdk::category::marker_api{}); + } + + for(auto itr : *memory_copy_data) + { + auto name = buffer_names.at(itr.kind, itr.operation); + _hash_data.emplace( + get_hash_id(name), + region_info{std::string{name}, OTF2_REGION_ROLE_DATA_TRANSFER, OTF2_PARADIGM_HIP}); + + // TODO: add attributes for memory copy parameters + + auto& _evt_info = agent_memcpy_info.at(itr.thread_id).at(itr.dst_agent_id); + _evt_info.event_count += 1; + + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER, + name, + _evt_info.get_location(), + itr.start_timestamp, + get_attr(sdk::category::memory_copy{})}); + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT, + name, + _evt_info.get_location(), + itr.end_timestamp, + nullptr}); + } + + for(auto itr : *kernel_dispatch_data) + { + const auto& info = itr.dispatch_info; + const auto* sym = _get_kernel_sym_data(info); + CHECK(sym != nullptr); + + auto name = tool_functions->tool_get_kernel_name_fn(info.kernel_id, + itr.correlation_id.external.value); + _hash_data.emplace( + get_hash_id(name), + region_info{std::string{name}, OTF2_REGION_ROLE_FUNCTION, OTF2_PARADIGM_HIP}); + + // TODO: add attributes for kernel dispatch parameters + + auto& _evt_info = agent_dispatch_info.at(itr.thread_id).at(info.agent_id).at(info.queue_id); + _evt_info.event_count += 1; + + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER, + name, + _evt_info.get_location(), + itr.start_timestamp, + get_attr(sdk::category::kernel_dispatch{})}); + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT, + name, + _evt_info.get_location(), + itr.end_timestamp, + nullptr}); + } + + std::sort(_data.begin(), _data.end(), [](const evt_data& lhs, const evt_data& rhs) { + if(lhs.timestamp != rhs.timestamp) return (lhs.timestamp < rhs.timestamp); + if(lhs.phase != rhs.phase) return (lhs.phase > rhs.phase); + return (*lhs.location < *rhs.location); + }); + + for(const auto& itr : _data) + { + add_event(itr.name, *itr.location, itr.phase, itr.timestamp, itr.attributes); + ROCP_ERROR_IF(itr.timestamp < _app_ts.app_start_time) + << "event found with timestamp < app start time by " + << (_app_ts.app_start_time - itr.timestamp) << " nsec :: " << itr.name; + ROCP_ERROR_IF(itr.timestamp > _app_ts.app_end_time) + << "event found with timestamp > app end time by " + << (itr.timestamp - _app_ts.app_end_time) << " nsec :: " << itr.name; + } + + for(const auto& itr : _data) + { + if(itr.attributes) OTF2_AttributeList_Delete(itr.attributes); + } + + OTF2_CHECK(OTF2_Archive_CloseEvtFiles(archive)); + + OTF2_CHECK(OTF2_Archive_OpenDefFiles(archive)); + for(auto& itr : get_locations()) + { + OTF2_DefWriter* def_writer = OTF2_Archive_GetDefWriter(archive, itr->index); + OTF2_Archive_CloseDefWriter(archive, def_writer); + } + OTF2_CHECK(OTF2_Archive_CloseDefFiles(archive)); + + auto _timer_resolution = + common::get_clock_period_ns_impl(common::default_clock_id) * std::nano::den; + auto _global_offset = _app_ts.app_start_time; + auto _max_trace_length = (_app_ts.app_end_time - _app_ts.app_start_time); + + OTF2_GlobalDefWriter* global_def_writer = OTF2_Archive_GetGlobalDefWriter(archive); + OTF2_CHECK(OTF2_GlobalDefWriter_WriteClockProperties( + global_def_writer, + _timer_resolution, + _global_offset, + _max_trace_length, + std::chrono::system_clock::now().time_since_epoch().count())); + + OTF2_CHECK(OTF2_GlobalDefWriter_WriteString(global_def_writer, 0, "")); + for(const auto& itr : _hash_data) + { + if(itr.first != 0) + OTF2_CHECK(OTF2_GlobalDefWriter_WriteString( + global_def_writer, itr.first, itr.second.name.c_str())); + } + + for(const auto& itr : _hash_data) + { + if(itr.first != 0) + OTF2_CHECK(OTF2_GlobalDefWriter_WriteRegion(global_def_writer, + itr.first, + itr.first, + 0, + 0, + itr.second.region_role, + itr.second.paradigm, + OTF2_REGION_FLAG_NONE, + 0, + 0, + 0)); + } + + auto add_write_string = [&global_def_writer](size_t _hash, std::string_view _name) { + static auto _existing = std::unordered_set{}; + if(_hash > 0 && _existing.count(_hash) == 0) + { + OTF2_CHECK(OTF2_GlobalDefWriter_WriteString(global_def_writer, _hash, _name.data())); + _existing.emplace(_hash); + } + }; + + auto add_write_string_val = [&add_write_string](std::string_view _name_v) { + auto _hash_v = get_hash_id(_name_v); + add_write_string(_hash_v, _name_v); + return _hash_v; + }; + + auto _attr_name = std::string_view{"category"}; + auto _attr_desc = std::string_view{"tracing category"}; + + auto _attr_name_hash = add_write_string_val(_attr_name); + auto _attr_desc_hash = add_write_string_val(_attr_desc); + + OTF2_CHECK(OTF2_GlobalDefWriter_WriteAttribute( + global_def_writer, 0, _attr_name_hash, _attr_desc_hash, OTF2_TYPE_STRING)); + + for(const auto& itr : _attr_str) + add_write_string(itr.first, itr.second); + + auto _cmdline = common::read_command_line(pid); + auto _exe_name = (_cmdline.empty()) ? std::string{"??"} : _cmdline.at(0); + auto _exe_hash = get_hash_id(_exe_name); + add_write_string(_exe_hash, _exe_name); + + auto _node_name = std::string{"node"}; + { + char _hostname_c[PATH_MAX]; + if(::gethostname(_hostname_c, PATH_MAX) == 0 && ::strnlen(_hostname_c, PATH_MAX) < PATH_MAX) + _node_name = std::string{_hostname_c}; + } + auto _node_hash = get_hash_id(_node_name); + add_write_string(_node_hash, _node_name); + + OTF2_CHECK(OTF2_GlobalDefWriter_WriteSystemTreeNode( + global_def_writer, 0, _exe_hash, _node_hash, OTF2_UNDEFINED_SYSTEM_TREE_NODE)); + + // Process + OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocationGroup(global_def_writer, + 0, + _exe_hash, + OTF2_LOCATION_GROUP_TYPE_PROCESS, + 0, + OTF2_UNDEFINED_LOCATION_GROUP)); + + // Accelerators + for(const auto& agent_v : agent_data) + { + const auto* _name = agent_v.name; + auto _hash = get_hash_id(_name); + + add_write_string(_hash, _name); + OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocationGroup(global_def_writer, + agent_v.id.handle, + _hash, + OTF2_LOCATION_GROUP_TYPE_ACCELERATOR, + 0, + OTF2_UNDEFINED_LOCATION_GROUP)); + } + + // Thread Events + for(auto& [tid, evt] : thread_event_info) + { + auto _hash = get_hash_id(evt.name); + + add_write_string(_hash, evt.name); + OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer, + evt.id(), // id + _hash, + OTF2_LOCATION_TYPE_CPU_THREAD, + 2 * evt.event_count, // # events + 0 // location group + )); + } + + // Memcpy Events + for(auto& [tid, itr] : agent_memcpy_info) + { + for(auto& [agent, evt] : itr) + { + auto _hash = get_hash_id(evt.name); + + add_write_string(_hash, evt.name); + OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer, + evt.id(), // id + _hash, + OTF2_LOCATION_TYPE_ACCELERATOR_STREAM, + 2 * evt.event_count, // # events + agent.handle // location group + )); + } + } + + // Dispatch Events + for(auto& [tid, itr] : agent_dispatch_info) + { + for(auto& [agent, qitr] : itr) + { + for(auto& [queue, evt] : qitr) + { + auto _hash = get_hash_id(evt.name); + + add_write_string(_hash, evt.name); + OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer, + evt.id(), // id + _hash, + OTF2_LOCATION_TYPE_ACCELERATOR_STREAM, + 2 * evt.event_count, // # events + agent.handle // location group + )); + } + } + } + + shutdown(); +} + +} // namespace tool +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-tool/generateOTF2.hpp b/source/lib/rocprofiler-sdk-tool/generateOTF2.hpp new file mode 100644 index 00000000..a70d2209 --- /dev/null +++ b/source/lib/rocprofiler-sdk-tool/generateOTF2.hpp @@ -0,0 +1,44 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "helper.hpp" + +#include + +namespace rocprofiler +{ +namespace tool +{ +void +write_otf2(tool_table* tool_functions, + uint64_t pid, + const std::vector& agent_data, + std::deque* hip_api_data, + std::deque* hsa_api_data, + std::deque* kernel_dispatch_data, + std::deque* memory_copy_data, + std::deque* marker_api_data, + std::deque* scratch_memory_data); +} // namespace tool +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-tool/output_file.cpp b/source/lib/rocprofiler-sdk-tool/output_file.cpp index f54403d5..8d04e701 100644 --- a/source/lib/rocprofiler-sdk-tool/output_file.cpp +++ b/source/lib/rocprofiler-sdk-tool/output_file.cpp @@ -33,18 +33,11 @@ namespace tool { namespace fs = common::filesystem; -std::pair -get_output_stream(std::string_view fname, std::string_view ext) +std::string +get_output_filename(std::string_view fname, std::string_view ext) { auto cfg_output_path = tool::format(tool::get_config().output_path); - if(cfg_output_path == "stdout" || cfg_output_path == "STDOUT") - return {&std::cout, [](auto*&) {}}; - else if(cfg_output_path == "stderr" || cfg_output_path == "STDERR") - return {&std::cout, [](auto*&) {}}; - else if(cfg_output_path.empty()) - return {&std::clog, [](auto*&) {}}; - // add a period to provided file extension if necessary constexpr auto period = std::string_view{"."}; constexpr auto noperiod = std::string_view{}; @@ -60,10 +53,22 @@ get_output_stream(std::string_view fname, std::string_view ext) output_path.string())}; if(!fs::exists(output_path)) fs::create_directories(output_path); - auto output_file = - tool::format(output_path / fmt::format("{}_{}{}", output_prefix, fname, _ext)); + return tool::format(output_path / fmt::format("{}_{}{}", output_prefix, fname, _ext)); +} +std::pair +get_output_stream(std::string_view fname, std::string_view ext) +{ + auto cfg_output_path = tool::format(tool::get_config().output_path); + + if(cfg_output_path == "stdout" || cfg_output_path == "STDOUT") + return {&std::cout, [](auto*&) {}}; + else if(cfg_output_path == "stderr" || cfg_output_path == "STDERR") + return {&std::cout, [](auto*&) {}}; + else if(cfg_output_path.empty()) + return {&std::clog, [](auto*&) {}}; - auto* _ofs = new std::ofstream{output_file}; + auto output_file = get_output_filename(fname, ext); + auto* _ofs = new std::ofstream{output_file}; LOG_IF(FATAL, !_ofs && !*_ofs) << fmt::format("Failed to open {} for output", output_file); ROCP_ERROR << "Opened result file: " << output_file; diff --git a/source/lib/rocprofiler-sdk-tool/output_file.hpp b/source/lib/rocprofiler-sdk-tool/output_file.hpp index fa34dbcb..40e08a9c 100644 --- a/source/lib/rocprofiler-sdk-tool/output_file.hpp +++ b/source/lib/rocprofiler-sdk-tool/output_file.hpp @@ -43,6 +43,9 @@ namespace tool { using output_stream_dtor_t = void (*)(std::ostream*&); +std::string +get_output_filename(std::string_view fname, std::string_view ext); + std::pair get_output_stream(std::string_view fname, std::string_view ext); diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index ba44a505..aef5c231 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -26,6 +26,7 @@ #include "domain_type.hpp" #include "generateCSV.hpp" #include "generateJSON.hpp" +#include "generateOTF2.hpp" #include "generatePerfetto.hpp" #include "helper.hpp" #include "output_file.hpp" @@ -1249,8 +1250,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { client_finalizer = fini_func; - const uint64_t buffer_size = 32 * common::units::get_page_size(); - const uint64_t buffer_watermark = 31 * common::units::get_page_size(); + constexpr uint64_t buffer_size = 32 * common::units::KiB; + constexpr uint64_t buffer_watermark = 31 * common::units::KiB; rocprofiler_get_timestamp(&(stats_timestamp->app_start_time)); @@ -1528,6 +1529,7 @@ tool_fini(void* /*tool_data*/) rocprofiler_get_timestamp(&(stats_timestamp->app_end_time)); + flush(); rocprofiler_stop_context(get_client_ctx()); flush(); @@ -1605,6 +1607,19 @@ tool_fini(void* /*tool_data*/) &scratch_memory_output.element_data); } + if(tool::get_config().otf2_output) + { + rocprofiler::tool::write_otf2(tool_functions, + getpid(), + _agents, + &hip_output.element_data, + &hsa_output.element_data, + &kernel_dispatch_output.element_data, + &memory_copy_output.element_data, + &marker_output.element_data, + &scratch_memory_output.element_data); + } + auto destroy_output = [](auto& _buffered_output_v) { _buffered_output_v.destroy(); }; destroy_output(kernel_dispatch_output); diff --git a/tests/pytest-packages/CMakeLists.txt b/tests/pytest-packages/CMakeLists.txt index fc347a10..9a90f056 100644 --- a/tests/pytest-packages/CMakeLists.txt +++ b/tests/pytest-packages/CMakeLists.txt @@ -5,7 +5,7 @@ set(PACKAGE_OUTPUT_DIR ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk) -set(PYTEST_UTILS_PYTHON_SOURCES __init__.py dotdict.py perfetto_reader.py) +set(PYTEST_UTILS_PYTHON_SOURCES __init__.py dotdict.py perfetto_reader.py otf2_reader.py) set(TESTS_PYTHON_SOURCES __init__.py rocprofv3.py) foreach(_FILE ${PYTEST_UTILS_PYTHON_SOURCES}) diff --git a/tests/pytest-packages/pytest_utils/otf2_reader.py b/tests/pytest-packages/pytest_utils/otf2_reader.py new file mode 100644 index 00000000..740816fa --- /dev/null +++ b/tests/pytest-packages/pytest_utils/otf2_reader.py @@ -0,0 +1,201 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import absolute_import + +import otf2 +import pandas as pd + +from otf2.events import Enter, Leave + + +class Region(object): + """ """ + + def __init__(self, _enter, _leave, _depth, _location=None): + if _enter.region != _leave.region: + _location_info = f"\n{_location}" if _location else "" + raise ValueError( + f"enter region != leave region :: '{_enter}' != '{_leave}'{_location_info}" + ) + + if _depth < 0: + _location_info = f". location: '{_location}'" if _location else "" + raise ValueError( + f"negative depth ({_depth})! enter: '{_enter}'. leave: '{_leave}'{_location_info}" + ) + + self.region = _enter.region + self.depth = _depth + self.name = _enter.region.name + self.attributes = [ + itr for itr in [_enter.attributes, _leave.attributes] if itr is not None + ] + self.enter_nsec = _enter.time + self.leave_nsec = _leave.time + self.delta_nsec = _leave.time - _enter.time + if self.delta_nsec < 0: + raise ValueError( + f"negative timestamp delta :: '{_enter.time}' > '{_leave.time}'" + ) + + for itr in self.attributes: + for key, val in itr.items(): + _key = f"{key.name}" + if not hasattr(self, _key): + setattr(self, _key, val) + + if not hasattr(self, "category"): + self.category = "unk" + + def __str__(self): + return f"{self.name:<35} :: {self.delta_nsec} nsec" + + +class OTF2Reader: + """Read in perfetto protobuf output""" + + def __init__(self, filename): + self.filename = filename if isinstance(filename, (list, tuple)) else [filename] + + def read(self): + + def _read_trace(trace_name): + trace = otf2.reader.Reader(trace_name) + # print(f"Read {len(trace.definitions.strings)} string definitions") + # for string in trace.definitions.strings: + # print(f"String definition with value '{string}' in trace.") + # print("Read {} events".format(len(trace.events))) + + events = [[loc, evt] for loc, evt in trace.events] + locations = [itr for itr in trace.definitions.locations] + location_groups = [itr for itr in trace.definitions.location_groups] + system_tree_nodes = [itr for itr in trace.definitions.system_tree_nodes] + + call_stack = {} + partial_call_stack = {} + + for itr in system_tree_nodes: + call_stack[itr] = {} + partial_call_stack[itr] = {} + + for itr in location_groups: + call_stack[itr.system_tree_parent][itr] = {} + partial_call_stack[itr.system_tree_parent][itr] = {} + + for itr in locations: + call_stack[itr.group.system_tree_parent][itr.group][itr] = [] + partial_call_stack[itr.group.system_tree_parent][itr.group][itr] = [] + + for location, event in events: + _stree = location.group.system_tree_parent + _group = location.group + _partial = partial_call_stack[_stree][_group][location] + if isinstance(event, Enter): + # expected length + _elen = len(_partial) + 1 + _partial.append(event) + elif isinstance(event, Leave): + # expected length + _elen = len(_partial) - 1 + _depth = len(_partial) + _leave = event + + # it appears that on MI300, the end of A may exceed the + # begin of B kernels very slightly (i.e. overlap in same + # stream/queue). This leads to slightly out of order + # Enter/Leave regions and thus we need to occasionally + # search further back in the callstack to find the correct + # Enter region + _enter = _partial[-1] + if _enter.region == _leave.region: + _partial.pop() + else: + for ridx, ritr in enumerate(reversed(_partial)): + if ritr.region == _leave.region: + _enter = _partial.pop(len(_partial) - ridx - 1) + break + + # below is what is expected on non-MI300 + # _enter = _partial.pop() + + # add the region + call_stack[_stree][_group][location].append( + Region(_enter, _leave, _depth - 1, location) + ) + + # modified length + _mlen = len(partial_call_stack[_stree][_group][location]) + # if modified length != expected length + if _mlen != _elen: + raise RuntimeError( + f"Modified length ({_mlen}) != Expected length({_elen}) for {event} at {location}" + ) + + data = { + "system_tree_node": [], + "location_group": [], + "location": [], + "region": [], + "attributes": [], + "depth": [], + "name": [], + "category": [], + "start_ts": [], + "end_ts": [], + } + + for tree, lgitr in call_stack.items(): + for group, gitr in lgitr.items(): + for loc, ritr in gitr.items(): + for region in ritr: + data["system_tree_node"] += [tree] + data["location_group"] += [group] + data["location"] += [loc] + data["region"] += [region.region] + data["attributes"] += [region.attributes] + data["depth"] += [region.depth] + data["category"] += [region.category] + data["name"] += [region.name] + data["start_ts"] += [region.enter_nsec] + data["end_ts"] += [region.leave_nsec] + + return (trace, pd.DataFrame.from_dict(data)) + + readers = [] + df = pd.DataFrame() + for itr in self.filename: + _reader, _df = _read_trace(itr) + readers += [_reader] + df = pd.concat([df, _df]) + + return (df, readers) + + +def read_trace(filename): + data = OTF2Reader(filename).read()[0] + + print(f"\nDATA:\n{data}") + + attributes = list(data["attributes"]) + + print(f"\nATTRIBUTES:\n{attributes}") diff --git a/tests/pytest-packages/tests/rocprofv3.py b/tests/pytest-packages/tests/rocprofv3.py index 4d0221c3..d9751538 100644 --- a/tests/pytest-packages/tests/rocprofv3.py +++ b/tests/pytest-packages/tests/rocprofv3.py @@ -48,3 +48,30 @@ def test_perfetto_data( assert len(_pf_data) == len( _js_data ), f"{pf_category} ({len(_pf_data)}):\n\t{_pf_data}\n{js_category} ({len(_js_data)}):\n\t{_js_data}" + + +def test_otf2_data( + otf2_data, json_data, categories=("hip", "hsa", "marker", "kernel", "memory_copy") +): + + mapping = { + "hip": ("hip_api", "hip_api"), + "hsa": ("hsa_api", "hsa_api"), + "marker": ("marker_api", "marker_api"), + "kernel": ("kernel_dispatch", "kernel_dispatch"), + "memory_copy": ("memory_copy", "memory_copy"), + } + + # make sure they specified valid categories + for itr in categories: + assert itr in mapping.keys() + + for otf2_category, json_category in [ + itr for key, itr in mapping.items() if key in categories + ]: + _otf2_data = otf2_data.loc[otf2_data["category"] == otf2_category] + _json_data = json_data["rocprofiler-sdk-tool"]["buffer_records"][json_category] + + assert len(_otf2_data) == len( + _json_data + ), f"{otf2_category} ({len(_otf2_data)}):\n\t{_otf2_data}\n{json_category} ({len(_json_data)}):\n\t{_json_data}" diff --git a/tests/rocprofv3/kernel-rename/CMakeLists.txt b/tests/rocprofv3/kernel-rename/CMakeLists.txt index bc726901..00868131 100644 --- a/tests/rocprofv3/kernel-rename/CMakeLists.txt +++ b/tests/rocprofv3/kernel-rename/CMakeLists.txt @@ -36,7 +36,7 @@ add_test( $ -M --sys-trace no --hsa-trace=0 --hsa-core-trace=1 --hip-compiler-trace False --hip-runtime-trace --kernel-trace --memory-copy-trace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace-cmd-line -o out - --output-format pftrace json --log-level env --kernel-rename + --output-format pftrace json otf2 --log-level env --kernel-rename --perfetto-shmem-size-hint 128 --perfetto-buffer-size 2048000 --perfetto-buffer-fill-policy ring_buffer --perfetto-backend inprocess -- $) @@ -52,7 +52,9 @@ add_test( ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.json --pftrace-input - ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.pftrace) + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.pftrace + --otf2-input + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.otf2) set(VALIDATION_FILES ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.pftrace @@ -94,7 +96,9 @@ add_test( ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.json --pftrace-input - ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.pftrace) + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.pftrace + --otf2-input + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.otf2) set(VALIDATION_FILES ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.pftrace diff --git a/tests/rocprofv3/kernel-rename/conftest.py b/tests/rocprofv3/kernel-rename/conftest.py index 1f9db6b2..0efcd2c4 100644 --- a/tests/rocprofv3/kernel-rename/conftest.py +++ b/tests/rocprofv3/kernel-rename/conftest.py @@ -2,10 +2,12 @@ import pytest import json +import os from rocprofiler_sdk.pytest_utils.dotdict import dotdict from rocprofiler_sdk.pytest_utils import collapse_dict_list from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader +from rocprofiler_sdk.pytest_utils.otf2_reader import OTF2Reader def pytest_addoption(parser): @@ -19,6 +21,11 @@ def pytest_addoption(parser): action="store", help="Path to Perfetto trace file.", ) + parser.addoption( + "--otf2-input", + action="store", + help="Path to OTF2 trace file.", + ) @pytest.fixture @@ -32,3 +39,11 @@ def json_data(request): def pftrace_data(request): filename = request.config.getoption("--pftrace-input") return PerfettoReader(filename).read()[0] + + +@pytest.fixture +def otf2_data(request): + filename = request.config.getoption("--otf2-input") + if not os.path.exists(filename): + raise FileExistsError(f"{filename} does not exist") + return OTF2Reader(filename).read()[0] diff --git a/tests/rocprofv3/kernel-rename/input-kernel-rename.yml b/tests/rocprofv3/kernel-rename/input-kernel-rename.yml index 08e42559..4e10b41c 100644 --- a/tests/rocprofv3/kernel-rename/input-kernel-rename.yml +++ b/tests/rocprofv3/kernel-rename/input-kernel-rename.yml @@ -11,7 +11,7 @@ jobs: marker_trace: False output_directory: "@CMAKE_CURRENT_BINARY_DIR@/%argt%-trace-inp-yaml" output_file: out - output_format: [pftrace, json] + output_format: [pftrace, json, otf2] log_level: env kernel_rename: True perfetto_shmem_size_hint: 128 diff --git a/tests/rocprofv3/kernel-rename/validate.py b/tests/rocprofv3/kernel-rename/validate.py index b4dd1d53..c3b60f5a 100644 --- a/tests/rocprofv3/kernel-rename/validate.py +++ b/tests/rocprofv3/kernel-rename/validate.py @@ -143,6 +143,14 @@ def test_perfetto_data(pftrace_data, json_data): ) +def test_otf2_data(otf2_data, json_data): + import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 + + rocprofv3.test_otf2_data( + otf2_data, json_data, ("hip", "hsa", "kernel", "memory_copy") + ) + + if __name__ == "__main__": exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) sys.exit(exit_code) diff --git a/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt b/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt index 7582197a..660cd0e4 100644 --- a/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt +++ b/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt @@ -10,75 +10,31 @@ project( find_package(rocprofiler-sdk REQUIRED) -add_test( - NAME rocprofv3-test-trace-hip-in-libraries-execute - COMMAND - $ --hip-runtime-trace --hsa-core-trace - --hsa-amd-trace --marker-trace --kernel-trace --memory-copy-trace --stats - --output-format csv -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace -o out -- - $) - -add_test( - NAME rocprofv3-test-trace-hip-in-libraries-json-execute - COMMAND - $ --hip-runtime-trace --hsa-core-trace - --hsa-amd-trace --marker-trace --kernel-trace --memory-copy-trace --stats - --output-format JSON -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace -o out -- - $) - -add_test( - NAME rocprofv3-test-trace-hip-in-libraries-pftrace-execute - COMMAND - $ --hip-runtime-trace --hsa-core-trace - --hsa-amd-trace --marker-trace --kernel-trace --memory-copy-trace --stats - --output-format pftrace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace -o out -- - $) +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py) string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") set(tracing-env "${PRELOAD_ENV}") -set_tests_properties( - rocprofv3-test-trace-hip-in-libraries-execute - PROPERTIES - TIMEOUT - 100 - LABELS - "integration-tests" - ENVIRONMENT - "${tracing-env}" - FAIL_REGULAR_EXPRESSION - "HSA_CORE_API|HSA_AMD_EXT_API|HSA_IMAGE_EXT_API|HSA_FINALIZER_EXT_API|HIP_API|HIP_COMPILER_API|KERNEL_DISPATCH|CODE_OBJECT" - ) +set(VALIDATION_DEPENDS) +foreach(_OUTPUT_FORMAT csv json pftrace otf2) + add_test( + NAME rocprofv3-test-trace-hip-in-libraries-${_OUTPUT_FORMAT}-execute + COMMAND + $ --hip-runtime-trace + --hsa-core-trace --hsa-amd-trace --marker-trace --kernel-trace + --memory-copy-trace --stats --output-format ${_OUTPUT_FORMAT} -d + ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace -o out -- + $) -set_tests_properties( - rocprofv3-test-trace-hip-in-libraries-json-execute - PROPERTIES - TIMEOUT - 100 - LABELS - "integration-tests" - ENVIRONMENT - "${tracing-env}" - FAIL_REGULAR_EXPRESSION - "HSA_CORE_API|HSA_AMD_EXT_API|HSA_IMAGE_EXT_API|HSA_FINALIZER_EXT_API|HIP_API|HIP_COMPILER_API|KERNEL_DISPATCH|CODE_OBJECT" - ) + set_tests_properties( + rocprofv3-test-trace-hip-in-libraries-${_OUTPUT_FORMAT}-execute + PROPERTIES TIMEOUT 100 LABELS "integration-tests" ENVIRONMENT "${tracing-env}") -set_tests_properties( - rocprofv3-test-trace-hip-in-libraries-pftrace-execute - PROPERTIES - TIMEOUT - 100 - LABELS - "integration-tests" - ENVIRONMENT - "${tracing-env}" - FAIL_REGULAR_EXPRESSION - "HSA_CORE_API|HSA_AMD_EXT_API|HSA_IMAGE_EXT_API|HSA_FINALIZER_EXT_API|HIP_API|HIP_COMPILER_API|KERNEL_DISPATCH|CODE_OBJECT" - ) - -rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py) + list(APPEND VALIDATION_DEPENDS + rocprofv3-test-trace-hip-in-libraries-${_OUTPUT_FORMAT}-execute) +endforeach() add_test( NAME rocprofv3-test-trace-hip-in-libraries-validate @@ -103,7 +59,8 @@ add_test( ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_memory_copy_stats.csv --json-input ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_results.json --pftrace-input - ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_results.pftrace) + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_results.pftrace + --otf2-input ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_results.otf2) set(VALIDATION_FILES ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_results.pftrace @@ -120,14 +77,13 @@ set(VALIDATION_FILES set_tests_properties( rocprofv3-test-trace-hip-in-libraries-validate - PROPERTIES - TIMEOUT - 45 - LABELS - "integration-tests" - DEPENDS - "rocprofv3-test-trace-hip-in-libraries-execute;rocprofv3-test-trace-hip-in-libraries-json-execute;rocprofv3-test-trace-hip-in-libraries-pftrace-execute" - FAIL_REGULAR_EXPRESSION - "AssertionError" - ATTACHED_FILES_ON_FAIL - "${VALIDATION_FILES}") + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests" + DEPENDS + "${VALIDATION_DEPENDS}" + FAIL_REGULAR_EXPRESSION + "AssertionError" + ATTACHED_FILES_ON_FAIL + "${VALIDATION_FILES}") diff --git a/tests/rocprofv3/tracing-hip-in-libraries/conftest.py b/tests/rocprofv3/tracing-hip-in-libraries/conftest.py index 2d1bef6a..e6d5f2b9 100644 --- a/tests/rocprofv3/tracing-hip-in-libraries/conftest.py +++ b/tests/rocprofv3/tracing-hip-in-libraries/conftest.py @@ -8,6 +8,7 @@ from rocprofiler_sdk.pytest_utils.dotdict import dotdict from rocprofiler_sdk.pytest_utils import collapse_dict_list from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader +from rocprofiler_sdk.pytest_utils.otf2_reader import OTF2Reader def pytest_addoption(parser): @@ -61,7 +62,6 @@ def pytest_addoption(parser): action="store", help="Path to memory copy stats CSV file.", ) - parser.addoption( "--json-input", action="store", @@ -72,6 +72,11 @@ def pytest_addoption(parser): action="store", help="Path to Perfetto trace file.", ) + parser.addoption( + "--otf2-input", + action="store", + help="Path to OTF2 trace file.", + ) @pytest.fixture @@ -210,3 +215,9 @@ def json_data(request): def pftrace_data(request): filename = request.config.getoption("--pftrace-input") return PerfettoReader(filename).read()[0] + + +@pytest.fixture +def otf2_data(request): + filename = request.config.getoption("--otf2-input") + return OTF2Reader(filename).read()[0] diff --git a/tests/rocprofv3/tracing-hip-in-libraries/validate.py b/tests/rocprofv3/tracing-hip-in-libraries/validate.py index bfa48575..600d7733 100644 --- a/tests/rocprofv3/tracing-hip-in-libraries/validate.py +++ b/tests/rocprofv3/tracing-hip-in-libraries/validate.py @@ -438,6 +438,13 @@ def test_perfetto_data(pftrace_data, json_data): ) +def test_otf2_data(otf2_data, json_data): + import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 + + # do not test for HSA and HIP since that may vary slightly b/t two separate runs + rocprofv3.test_otf2_data(otf2_data, json_data, ("marker", "kernel", "memory_copy")) + + if __name__ == "__main__": exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) sys.exit(exit_code) From 4d7b8ece80081df641b5d7d3a854f1f6ed641af8 Mon Sep 17 00:00:00 2001 From: SrirakshaNag <104580803+SrirakshaNag@users.noreply.github.com> Date: Tue, 30 Jul 2024 22:40:44 -0500 Subject: [PATCH 2/7] fix iteration range and add tests (#993) * fix iteration range and add tests * addressing review comments on tests --- source/lib/rocprofiler-sdk-tool/config.cpp | 19 ++-- .../counter-collection/CMakeLists.txt | 1 + .../range_filtering/CMakeLists.txt | 49 +++++++++++ .../range_filtering/conftest.py | 87 +++++++++++++++++++ .../range_filtering/input_range.json | 37 ++++++++ .../range_filtering/pytest.ini | 5 ++ .../range_filtering/validate.py | 56 ++++++++++++ 7 files changed, 242 insertions(+), 12 deletions(-) create mode 100644 tests/rocprofv3/counter-collection/range_filtering/CMakeLists.txt create mode 100644 tests/rocprofv3/counter-collection/range_filtering/conftest.py create mode 100644 tests/rocprofv3/counter-collection/range_filtering/input_range.json create mode 100644 tests/rocprofv3/counter-collection/range_filtering/pytest.ini create mode 100644 tests/rocprofv3/counter-collection/range_filtering/validate.py diff --git a/source/lib/rocprofiler-sdk-tool/config.cpp b/source/lib/rocprofiler-sdk-tool/config.cpp index 65f8fdec..e3744d7a 100644 --- a/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/source/lib/rocprofiler-sdk-tool/config.cpp @@ -161,14 +161,14 @@ get_kernel_filter_range(const std::string& kernel_filter) { if(kernel_filter.empty()) return {}; - auto delim = rocprofiler::sdk::parse::tokenize(kernel_filter, ","); + auto delim = rocprofiler::sdk::parse::tokenize(kernel_filter, "[], "); auto range_set = std::unordered_set{}; for(const auto& itr : delim) { - if(itr.find('-') != std::string::npos && itr.find('[') != std::string::npos && - itr.find(']') != std::string::npos) + if(itr.find('-') != std::string::npos) { - auto drange = rocprofiler::sdk::parse::tokenize(itr, "[-] "); + auto drange = rocprofiler::sdk::parse::tokenize(itr, "- "); + ROCP_FATAL_IF(drange.size() != 2) << "bad range format for '" << itr << "'. Expected [A-B] where A and B are numbers"; @@ -179,14 +179,9 @@ get_kernel_filter_range(const std::string& kernel_filter) } else { - auto dval = rocprofiler::sdk::parse::tokenize(itr, " "); - ROCP_ERROR_IF(dval.empty()) << "kernel range value '" << itr << "' produced no numbers"; - for(const auto& ditr : dval) - { - ROCP_FATAL_IF(ditr.find_first_not_of("0123456789") != std::string::npos) - << "expected integer for " << itr << ". Non-integer value detected"; - range_set.emplace(std::stoul(ditr)); - } + ROCP_FATAL_IF(itr.find_first_not_of("0123456789") != std::string::npos) + << "expected integer for " << itr << ". Non-integer value detected"; + range_set.emplace(std::stoul(itr)); } } return range_set; diff --git a/tests/rocprofv3/counter-collection/CMakeLists.txt b/tests/rocprofv3/counter-collection/CMakeLists.txt index 1270ef5e..a320b7a9 100644 --- a/tests/rocprofv3/counter-collection/CMakeLists.txt +++ b/tests/rocprofv3/counter-collection/CMakeLists.txt @@ -7,3 +7,4 @@ add_subdirectory(input2) add_subdirectory(input3) add_subdirectory(list_metrics) add_subdirectory(kernel_filtering) +add_subdirectory(range_filtering) diff --git a/tests/rocprofv3/counter-collection/range_filtering/CMakeLists.txt b/tests/rocprofv3/counter-collection/range_filtering/CMakeLists.txt new file mode 100644 index 00000000..9fe274db --- /dev/null +++ b/tests/rocprofv3/counter-collection/range_filtering/CMakeLists.txt @@ -0,0 +1,49 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-counter-collection + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + input_range.json) + +add_test( + NAME rocprofv3-test-cc-kernel-filtering-range-filter-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input_range.json -d + ${CMAKE_CURRENT_BINARY_DIR}/range_filter -- $ 1 15) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(cc-env-kernel-filtering "${PRELOAD_ENV}") + +set_tests_properties( + rocprofv3-test-cc-kernel-filtering-range-filter-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT + "${cc-env-kernel-filtering}" FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +add_test( + NAME rocprofv3-test-cc-kernel-filtering-range-filter-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-config + ${CMAKE_CURRENT_BINARY_DIR}/input_range.json --input-json-pass1 + ${CMAKE_CURRENT_BINARY_DIR}/range_filter/pass_1/out_results.json + --input-json-pass2 + ${CMAKE_CURRENT_BINARY_DIR}/range_filter/pass_2/out_results.json + --input-json-pass3 + ${CMAKE_CURRENT_BINARY_DIR}/range_filter/pass_3/out_results.json) + +set_tests_properties( + rocprofv3-test-cc-kernel-filtering-range-filter-validate + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + "rocprofv3-test-cc-kernel-filtering-range-filter-execute" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/tests/rocprofv3/counter-collection/range_filtering/conftest.py b/tests/rocprofv3/counter-collection/range_filtering/conftest.py new file mode 100644 index 00000000..430e5eeb --- /dev/null +++ b/tests/rocprofv3/counter-collection/range_filtering/conftest.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +import json +import pytest +import pandas as pd +import re + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list + + +def pytest_addoption(parser): + + parser.addoption( + "--input-json-pass1", + action="store", + help="Path to JSON file.", + ) + + parser.addoption( + "--input-json-pass2", + action="store", + help="Path to JSON file.", + ) + + parser.addoption( + "--input-json-pass3", + action="store", + help="Path to JSON file.", + ) + + parser.addoption( + "--json-config", + action="store", + help="Path to input JSON file.", + ) + + +def extract_iteration_list(jobs, pass_): + + kernel_iteration_range = jobs[pass_]["kernel_iteration_range"] + _range = re.split(r"\[|,|\],|\[|,|\]", kernel_iteration_range) + _range = list(filter(lambda itr: itr != "", _range)) + range_list = [] + for itr in _range: + if "-" in itr: + interval = re.split("-", itr) + range_list.append(list(range((int)(interval[0]), (int)(interval[1])))) + else: + + range_list.append(itr) + return range_list + + +def process_config(out_file, input_config, pass_): + + ret_dict = {} + + with open(out_file, "r") as inp: + ret_dict["json_data"] = dotdict(collapse_dict_list(json.load(inp))) + + with open(input_config, "r") as inp: + jobs = dotdict(collapse_dict_list(json.load(inp)))["jobs"] + ret_dict["iteration_range"] = extract_iteration_list(jobs, pass_) + + return ret_dict + + +@pytest.fixture +def input_json_pass1(request): + out_file = request.config.getoption("--input-json-pass1") + input_config = request.config.getoption("--json-config") + return process_config(out_file, input_config, 0) + + +@pytest.fixture +def input_json_pass2(request): + out_file = request.config.getoption("--input-json-pass2") + input_config = request.config.getoption("--json-config") + return process_config(out_file, input_config, 1) + + +@pytest.fixture +def input_json_pass3(request): + out_file = request.config.getoption("--input-json-pass3") + input_config = request.config.getoption("--json-config") + return process_config(out_file, input_config, 2) diff --git a/tests/rocprofv3/counter-collection/range_filtering/input_range.json b/tests/rocprofv3/counter-collection/range_filtering/input_range.json new file mode 100644 index 00000000..02e64102 --- /dev/null +++ b/tests/rocprofv3/counter-collection/range_filtering/input_range.json @@ -0,0 +1,37 @@ +{ + "jobs": [ + { + "pmc": [ + "SQ_WAVES" + ], + "kernel_iteration_range": "[1, 3, 5, [8-12]]", + "kernel_include_regex" : "transpose", + "output_file": "out", + "truncate_kernels": true, + "kernel_trace" : true, + "output_format" : ["json"] + }, + { + "pmc": [ + "SQ_WAVES" + ], + "kernel_iteration_range": "[11-14], [4-5]", + "output_file": "out", + "truncate_kernels": true, + "kernel_trace" : true, + "output_format" : ["json"] + }, + + { + "pmc": [ + "SQ_WAVES" + ], + "kernel_iteration_range": "[[3-6], 1, 13]", + "output_file": "out", + "truncate_kernels": true, + "kernel_trace" : true, + "output_format" : ["json"] + } + + ] +} diff --git a/tests/rocprofv3/counter-collection/range_filtering/pytest.ini b/tests/rocprofv3/counter-collection/range_filtering/pytest.ini new file mode 100644 index 00000000..5e1e1c14 --- /dev/null +++ b/tests/rocprofv3/counter-collection/range_filtering/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/tests/rocprofv3/counter-collection/range_filtering/validate.py b/tests/rocprofv3/counter-collection/range_filtering/validate.py new file mode 100644 index 00000000..3f875622 --- /dev/null +++ b/tests/rocprofv3/counter-collection/range_filtering/validate.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import numpy as np +import pandas as pd +import re + + +def unique(lst): + return list(set(lst)) + + +def validate_json(input_data): + json_data = input_data["json_data"] + iteration_list = input_data["iteration_range"] + + data = json_data["rocprofiler-sdk-tool"] + counter_collection_data = data["callback_records"]["counter_collection"] + kernel_dispatch_data = data["buffer_records"]["kernel_dispatch"] + dispatch_ids = {} + + def get_kernel_name(kernel_id): + return data["kernel_symbols"][kernel_id]["formatted_kernel_name"] + + iteration = 1 + for dispatch in kernel_dispatch_data: + dispatch_info = dispatch["dispatch_info"] + kernel_name = get_kernel_name(dispatch_info["kernel_id"]) + if kernel_name == "transpose": + if iteration in iteration_list: + dispatch_ids[dispatch_info[dispatch_id]] = dispatch_info + iteration = iteration + 1 + + for counter in counter_collection_data: + dispatch_data = counter["dispatch_data"]["dispatch_info"] + dispatch_id = dispatch_data["dispatch_id"] + if dispatch_id in dispatch_ids.keys(): + assert dispatch_data == dispatch_ids[dispatch_id] + + +def test_validate_counter_collection_pass1(input_json_pass1): + validate_json(input_json_pass1) + + +def test_validate_counter_collection_pass2(input_json_pass2): + validate_json(input_json_pass2) + + +def test_validate_counter_collection_pass3(input_json_pass3): + validate_json(input_json_pass3) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) From 018f3ce056761381840a507f76c8e7971be49264 Mon Sep 17 00:00:00 2001 From: Gopesh Bhardwaj Date: Wed, 31 Jul 2024 21:46:37 +0530 Subject: [PATCH 3/7] Bump up version to 0.5.0 for AFAR VI (#998) * Bump up version to 0.5.0 for AFAR VI * updating changlog * updating CI --- .github/workflows/continuous_integration.yml | 1 - CHANGELOG.md | 19 +++++++++++++++++++ VERSION | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index e202beb5..7204112a 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -180,7 +180,6 @@ jobs: timeout-minutes: 10 shell: bash run: | - python3 -m pip install -r requirements.txt sudo python3 -m pip install -r requirements.txt - name: List Files diff --git a/CHANGELOG.md b/CHANGELOG.md index ebf5ebab..72a0d510 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -74,3 +74,22 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/ ## Changes - rocprofv3 tool now needs `--` in front of application. For detailed uses, please [Click Here](source/docs/rocprofv3.md) + +## ROCprofiler-SDK for AFAR VI + +## Additions + +- OTF2 Tool Support +- Kernel and Range Filtering +- Counter Collection Definitions in YAML +- Documentation updates (SQ Block, Counter Collection, Tracing, Tool Usage) +- Added rocprofv3 option --kernel-rename +- Added rocprofv3 options for perfetto settings (buffer size, etc.) +- Added CSV columns for kernel trace + - Thread_Id + - Dispatch_Id +- Added CSV column for counter_collection + +## Fixes + +- Miscellaneous bug fixes diff --git a/VERSION b/VERSION index 1d0ba9ea..8f0916f7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.4.0 +0.5.0 From 94b5d9be3fe0fddc64262a92203e1c5b3f26ced7 Mon Sep 17 00:00:00 2001 From: SrirakshaNag <104580803+SrirakshaNag@users.noreply.github.com> Date: Wed, 31 Jul 2024 22:46:01 -0500 Subject: [PATCH 4/7] Adding changes for handling abort signals (#979) * Adding changes for handling abort signals * Fix the test failure * Fixing CmakeLists error * Addressing review comments * fixing warnings * fixing execute test * Fixing abort app test * Address review comments * Apply suggestions from code review * Apply suggestions from code review * Fixes for testing issues * Adding kernel filtering test * Removing text input file * fix formatting issues * misc fix * Suppress signal-unsafe error in ThreadSanitizer - rename signal handler to rocprofv3_error_signal_handler to ensure specific filtering * Fix rocprofv3 aborted-app validation --------- Co-authored-by: Jonathan R. Madsen Co-authored-by: Jonathan R. Madsen --- source/docs/how-to/using-rocprofv3.rst | 2 +- source/lib/rocprofiler-sdk-tool/tool.cpp | 31 +++++- source/scripts/thread-sanitizer-suppr.txt | 3 + tests/bin/vector-operations/vector-ops.cpp | 15 ++- tests/rocprofv3/CMakeLists.txt | 1 + tests/rocprofv3/aborted-app/CMakeLists.txt | 44 +++++++++ tests/rocprofv3/aborted-app/conftest.py | 23 +++++ tests/rocprofv3/aborted-app/input.json | 16 +++ tests/rocprofv3/aborted-app/pytest.ini | 5 + tests/rocprofv3/aborted-app/validate.py | 108 +++++++++++++++++++++ 10 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 tests/rocprofv3/aborted-app/CMakeLists.txt create mode 100644 tests/rocprofv3/aborted-app/conftest.py create mode 100644 tests/rocprofv3/aborted-app/input.json create mode 100644 tests/rocprofv3/aborted-app/pytest.ini create mode 100644 tests/rocprofv3/aborted-app/validate.py diff --git a/source/docs/how-to/using-rocprofv3.rst b/source/docs/how-to/using-rocprofv3.rst index 39252e89..a13f7ee3 100644 --- a/source/docs/how-to/using-rocprofv3.rst +++ b/source/docs/how-to/using-rocprofv3.rst @@ -608,7 +608,7 @@ The following table lists the various fields or the columns in the output CSV fi Kernel Filtering +++++++++++++++++ -rocprofv3 supports kernel filtering. A kernel filter is a set of a regex string (to include the kernels matching this filter), a regex string (to exclude the kernels matching this filter), +rocprofv3 supports kernel filtering for profiling. A kernel filter is a set of a regex string (to include the kernels matching this filter), a regex string (to exclude the kernels matching this filter), and an iteration range (set of iterations of the included kernels). If the iteration range is not provided then all iterations of the included kernels are profiled. .. code-block:: shell diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index aef5c231..dfc839ab 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -48,13 +48,16 @@ #include #include #include -#include #include #include #include + #include +#include #include +#include +#include #include #include #include @@ -1753,6 +1756,9 @@ get_main_function() static main_func_t user_main = nullptr; return user_main; } + +bool signal_handler_exit = + rocprofiler::tool::get_env("ROCPROF_INTERNAL_TEST_SIGNAL_HANDLER_VIA_EXIT", false); } // namespace #define ROCPROFV3_INTERNAL_API __attribute__((visibility("internal"))); @@ -1761,6 +1767,15 @@ extern "C" { void rocprofv3_set_main(main_func_t main_func) ROCPROFV3_INTERNAL_API; +void +rocprofv3_error_signal_handler(int signo) +{ + finalize_rocprofv3(); + // below is for testing purposes. re-raising the signal causes CTest to ignore WILL_FAIL ON + if(signal_handler_exit) ::exit(signo); + ::raise(signo); +} + int rocprofv3_main(int argc, char** argv, char** envp) ROCPROFV3_INTERNAL_API; @@ -1848,6 +1863,20 @@ rocprofv3_main(int argc, char** argv, char** envp) initialize_rocprofv3(); + struct sigaction sig_act = {}; + sigemptyset(&sig_act.sa_mask); + sig_act.sa_flags = SA_RESETHAND | SA_NODEFER; + sig_act.sa_handler = &rocprofv3_error_signal_handler; + for(auto signal_v : {SIGTERM, SIGSEGV, SIGINT, SIGILL, SIGABRT, SIGFPE}) + { + if(sigaction(signal_v, &sig_act, nullptr) != 0) + { + auto _errno_v = errno; + ROCP_ERROR << "error setting signal handler for " << signal_v + << " :: " << strerror(_errno_v); + } + } + auto ret = CHECK_NOTNULL(get_main_function())(argc, argv, envp); finalize_rocprofv3(); diff --git a/source/scripts/thread-sanitizer-suppr.txt b/source/scripts/thread-sanitizer-suppr.txt index 54d588d3..ea9a4bf2 100644 --- a/source/scripts/thread-sanitizer-suppr.txt +++ b/source/scripts/thread-sanitizer-suppr.txt @@ -27,3 +27,6 @@ mutex:external/ptl/source/PTL/TaskGroup.hh # lock order inversion that cannot happen mutex:source/lib/common/synchronized.hpp + +# signal-unsafe function called from signal handler +signal:rocprofv3_error_signal_handler diff --git a/tests/bin/vector-operations/vector-ops.cpp b/tests/bin/vector-operations/vector-ops.cpp index fdc76723..33ca7145 100644 --- a/tests/bin/vector-operations/vector-ops.cpp +++ b/tests/bin/vector-operations/vector-ops.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +218,12 @@ run(int NUM_QUEUE, int DEVICE_ID) HIP_API_CALL(hipGetLastError()); + if(getenv("ROCPROF_TESTING_RAISE_SIGNAL") != nullptr && + std::stoi(getenv("ROCPROF_TESTING_RAISE_SIGNAL")) > 0) + { + ::raise(SIGINT); + } + hipLaunchKernelGGL(multiply_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), @@ -268,13 +275,17 @@ run(int NUM_QUEUE, int DEVICE_ID) } int -main() +main(int argc, char** argv) { + int stream_count = 8; int device_count = 0; HIP_API_CALL(hipGetDeviceCount(&device_count)); + if(argc > 1) stream_count = std::stoi(argv[1]); + if(argc > 2) device_count = std::stoi(argv[2]); + for(int i = 0; i < device_count; ++i) - run(8, i); + run(stream_count, i); return 0; } diff --git a/tests/rocprofv3/CMakeLists.txt b/tests/rocprofv3/CMakeLists.txt index 17e6e8a1..334b84d1 100644 --- a/tests/rocprofv3/CMakeLists.txt +++ b/tests/rocprofv3/CMakeLists.txt @@ -29,3 +29,4 @@ add_subdirectory(tracing-hip-in-libraries) add_subdirectory(counter-collection) add_subdirectory(hsa-queue-dependency) add_subdirectory(kernel-rename) +add_subdirectory(aborted-app) diff --git a/tests/rocprofv3/aborted-app/CMakeLists.txt b/tests/rocprofv3/aborted-app/CMakeLists.txt new file mode 100644 index 00000000..8b5e2702 --- /dev/null +++ b/tests/rocprofv3/aborted-app/CMakeLists.txt @@ -0,0 +1,44 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-aborted-app + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + input.json) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(aborted-app-env "${PRELOAD_ENV}" ROCPROF_TESTING_RAISE_SIGNAL=1 + ROCPROF_INTERNAL_TEST_SIGNAL_HANDLER_VIA_EXIT=1) + +# app-abort +add_test( + NAME rocprofv3-test-execute-app-abort + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.json -d + ${CMAKE_CURRENT_BINARY_DIR}/out-aborted-app -- $ 1 1) + +set_tests_properties( + rocprofv3-test-execute-app-abort + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${aborted-app-env}" + WILL_FAIL TRUE) + +add_test( + NAME rocprofv3-test-validate-app-abort + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input + ${CMAKE_CURRENT_BINARY_DIR}/out-aborted-app/pass_1/out_results.json) + +set_tests_properties( + rocprofv3-test-validate-app-abort + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + "rocprofv3-test-execute-app-abort" FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/tests/rocprofv3/aborted-app/conftest.py b/tests/rocprofv3/aborted-app/conftest.py new file mode 100644 index 00000000..437dc547 --- /dev/null +++ b/tests/rocprofv3/aborted-app/conftest.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import json +import pytest +import pandas as pd + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list + + +def pytest_addoption(parser): + parser.addoption( + "--json-input", + action="store", + help="Path to JSON file.", + ) + + +@pytest.fixture +def json_data(request): + filename = request.config.getoption("--json-input") + with open(filename, "r") as inp: + return dotdict(collapse_dict_list(json.load(inp))) diff --git a/tests/rocprofv3/aborted-app/input.json b/tests/rocprofv3/aborted-app/input.json new file mode 100644 index 00000000..a01b2a40 --- /dev/null +++ b/tests/rocprofv3/aborted-app/input.json @@ -0,0 +1,16 @@ +{ + "jobs": [ + { + "pmc":["SQ_WAVES"], + "kernel_include_regex": "addition", + "kernel_exclude_regex": "subtract", + "hip_runtime_trace": true, + "kernel_trace": true, + "output_file": "out", + "output_format": [ + "json" + ], + "truncate_kernels": true + } + ] +} diff --git a/tests/rocprofv3/aborted-app/pytest.ini b/tests/rocprofv3/aborted-app/pytest.ini new file mode 100644 index 00000000..5e1e1c14 --- /dev/null +++ b/tests/rocprofv3/aborted-app/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/tests/rocprofv3/aborted-app/validate.py b/tests/rocprofv3/aborted-app/validate.py new file mode 100644 index 00000000..a4cfa2a4 --- /dev/null +++ b/tests/rocprofv3/aborted-app/validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import re + +kernel_trace_list = sorted(["addition_kernel", "subtract_kernel"]) +kernel_counter_list = ["addition_kernel"] + + +def unique(lst): + return list(set(lst)) + + +def test_counter_collection_json_data(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_kernel_name(kernel_id): + return data["kernel_symbols"][kernel_id]["formatted_kernel_name"] + + counter_collection_data = data["callback_records"]["counter_collection"] + + for counter in counter_collection_data: + kernel_name = get_kernel_name(counter.dispatch_data.dispatch_info.kernel_id) + assert kernel_name in kernel_counter_list + + +def test_kernel_trace_json(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_kernel_name(kernel_id): + return data["kernel_symbols"][kernel_id]["formatted_kernel_name"] + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + kernel_dispatch_data = data["buffer_records"]["kernel_dispatch"] + kernels = [] + assert len(kernel_dispatch_data) == 2 + for dispatch in kernel_dispatch_data: + dispatch_info = dispatch["dispatch_info"] + kernel_name = get_kernel_name(dispatch_info["kernel_id"]) + + assert get_kind_name(dispatch["kind"]) == "KERNEL_DISPATCH" + assert dispatch["correlation_id"]["internal"] > 0 + assert dispatch_info["agent_id"]["handle"] > 0 + assert dispatch_info["queue_id"]["handle"] > 0 + assert dispatch_info["kernel_id"] > 0 + if not re.search(r"__amd_rocclr_.*", kernel_name): + kernels.append(kernel_name) + + assert dispatch_info["workgroup_size"]["x"] == 64 + assert dispatch_info["workgroup_size"]["y"] == 1 + assert dispatch_info["workgroup_size"]["z"] == 1 + assert dispatch_info["grid_size"]["x"] == 1024 + assert dispatch_info["grid_size"]["y"] == 1024 + assert dispatch_info["grid_size"]["z"] == 1 + assert dispatch["end_timestamp"] >= dispatch["start_timestamp"] + + assert kernels == kernel_trace_list + + +def test_hip_api_trace_json(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_operation_name(kind_id, op_id): + return data["strings"]["buffer_records"][kind_id]["operations"][op_id] + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + valid_domain_names = ("HIP_RUNTIME_API",) + + hip_api_data = data["buffer_records"]["hip_api"] + + functions = [] + for api in hip_api_data: + kind = get_kind_name(api["kind"]) + assert kind in valid_domain_names + assert api["end_timestamp"] >= api["start_timestamp"] + functions.append(get_operation_name(api["kind"], api["operation"])) + + expected_functions = ( + [ + "hipGetDeviceCount", + "hipSetDevice", + "hipDeviceSynchronize", + "hipStreamCreateWithFlags", + ] + + (["hipHostMalloc"] * 3) + + (["hipMallocAsync"] * 3) + + (["hipMemcpyAsync"] * 2) + + [ + "hipStreamSynchronize", + "hipDeviceSynchronize", + "hipLaunchKernel", + "hipGetLastError", + "hipLaunchKernel", + "hipGetLastError", + ] + ) + + assert functions == expected_functions + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) From 20e07caad46b121c118019c00085821b75f9c402 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 1 Aug 2024 00:10:09 -0500 Subject: [PATCH 5/7] Reorganize thread trace codeobj headers (#1001) * include/rocprofiler-sdk/cxx/codeobj - Relocated from include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj * Update include/rocprofiler-sdk/cxx - cmake updates - correct namespace rocprofiler::codeobj rocprofiler::sdk::codeobj * Update codeobj tests and samples --- samples/advanced_thread_trace/client.cpp | 8 +++--- samples/code_object_isa_decode/client.cpp | 6 ++--- .../rocprofiler-sdk/amd_detail/CMakeLists.txt | 2 -- .../rocprofiler-sdk-codeobj/CMakeLists.txt | 25 ------------------- .../rocprofiler-sdk/cxx/CMakeLists.txt | 1 + .../cxx/codeobj/CMakeLists.txt | 6 +++++ .../codeobj}/code_printing.hpp | 3 +++ .../codeobj}/disassembly.hpp | 3 +++ .../codeobj}/segment.hpp | 3 +++ .../tests/codeobj_library_test.cpp | 14 +++++------ tests/pc_sampling/address_translation.hpp | 8 +++--- tests/thread-trace/trace_callbacks.cpp | 6 ++--- 12 files changed, 37 insertions(+), 48 deletions(-) delete mode 100644 source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/CMakeLists.txt create mode 100644 source/include/rocprofiler-sdk/cxx/codeobj/CMakeLists.txt rename source/include/rocprofiler-sdk/{amd_detail/rocprofiler-sdk-codeobj => cxx/codeobj}/code_printing.hpp (99%) rename source/include/rocprofiler-sdk/{amd_detail/rocprofiler-sdk-codeobj => cxx/codeobj}/disassembly.hpp (99%) rename source/include/rocprofiler-sdk/{amd_detail/rocprofiler-sdk-codeobj => cxx/codeobj}/segment.hpp (98%) diff --git a/samples/advanced_thread_trace/client.cpp b/samples/advanced_thread_trace/client.cpp index ee4b787b..c35991ec 100644 --- a/samples/advanced_thread_trace/client.cpp +++ b/samples/advanced_thread_trace/client.cpp @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include "common/defines.hpp" @@ -106,9 +106,9 @@ namespace client using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; -using Instruction = rocprofiler::codeobj::disassembly::Instruction; -using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; -using SymbolInfo = rocprofiler::codeobj::disassembly::SymbolInfo; +using Instruction = rocprofiler::sdk::codeobj::disassembly::Instruction; +using CodeobjAddressTranslate = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; +using SymbolInfo = rocprofiler::sdk::codeobj::disassembly::SymbolInfo; rocprofiler_client_id_t* client_id = nullptr; rocprofiler_context_id_t client_ctx = {}; diff --git a/samples/code_object_isa_decode/client.cpp b/samples/code_object_isa_decode/client.cpp index 255bbd85..3f3d176f 100644 --- a/samples/code_object_isa_decode/client.cpp +++ b/samples/code_object_isa_decode/client.cpp @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include "common/defines.hpp" #include "common/filesystem.hpp" @@ -90,8 +90,8 @@ using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_ using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; using kernel_symbol_map_t = std::unordered_map>; -using Instruction = rocprofiler::codeobj::disassembly::Instruction; -using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; +using Instruction = rocprofiler::sdk::codeobj::disassembly::Instruction; +using CodeobjAddressTranslate = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; rocprofiler_client_id_t* client_id = nullptr; rocprofiler_client_finalize_t client_fini_func = nullptr; diff --git a/source/include/rocprofiler-sdk/amd_detail/CMakeLists.txt b/source/include/rocprofiler-sdk/amd_detail/CMakeLists.txt index 1e0bdd70..8d00d134 100644 --- a/source/include/rocprofiler-sdk/amd_detail/CMakeLists.txt +++ b/source/include/rocprofiler-sdk/amd_detail/CMakeLists.txt @@ -10,5 +10,3 @@ install( FILES ${ROCPROFILER_AMD_DETAIL_HEADER_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/amd_detail/ COMPONENT development) - -add_subdirectory(rocprofiler-sdk-codeobj) diff --git a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/CMakeLists.txt b/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/CMakeLists.txt deleted file mode 100644 index a997a7d0..00000000 --- a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# ############################################################################## -# # Copyright (c) 2024 Advanced Micro Devices, Inc. # # Permission is hereby -# granted, free of charge, to any person obtaining a copy # of this software and -# associated documentation files (the "Software"), to # deal in the Software -# without restriction, including without limitation the # rights to use, copy, -# modify, merge, publish, distribute, sublicense, and/or # sell copies of the -# Software, and to permit persons to whom the Software is # furnished to do so, -# subject to the following conditions: # # The above copyright notice and this -# permission notice shall be included in # all copies or substantial portions of -# the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY -# KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO -# EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR -# OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE -# USE OR OTHER DEALINGS # IN THE SOFTWARE. -# ############################################################################## - -set(CODEOBJ_PARSER_HEADERS code_printing.hpp disassembly.hpp segment.hpp) - -install( - FILES ${CODEOBJ_PARSER_HEADERS} - DESTINATION - ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj - COMPONENT development) diff --git a/source/include/rocprofiler-sdk/cxx/CMakeLists.txt b/source/include/rocprofiler-sdk/cxx/CMakeLists.txt index da9ddc76..0360b725 100644 --- a/source/include/rocprofiler-sdk/cxx/CMakeLists.txt +++ b/source/include/rocprofiler-sdk/cxx/CMakeLists.txt @@ -11,4 +11,5 @@ install( DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/cxx COMPONENT development) +add_subdirectory(codeobj) add_subdirectory(details) diff --git a/source/include/rocprofiler-sdk/cxx/codeobj/CMakeLists.txt b/source/include/rocprofiler-sdk/cxx/codeobj/CMakeLists.txt new file mode 100644 index 00000000..96473009 --- /dev/null +++ b/source/include/rocprofiler-sdk/cxx/codeobj/CMakeLists.txt @@ -0,0 +1,6 @@ +set(ROCPROFILER_CXX_CODEOBJ_HEADERS code_printing.hpp disassembly.hpp segment.hpp) + +install( + FILES ${ROCPROFILER_CXX_CODEOBJ_HEADERS} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/cxx/codeobj + COMPONENT development) diff --git a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/code_printing.hpp b/source/include/rocprofiler-sdk/cxx/codeobj/code_printing.hpp similarity index 99% rename from source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/code_printing.hpp rename to source/include/rocprofiler-sdk/cxx/codeobj/code_printing.hpp index 6beead3b..575544d6 100644 --- a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/code_printing.hpp +++ b/source/include/rocprofiler-sdk/cxx/codeobj/code_printing.hpp @@ -40,6 +40,8 @@ namespace rocprofiler { +namespace sdk +{ namespace codeobj { namespace disassembly @@ -423,4 +425,5 @@ class CodeobjAddressTranslate : public CodeobjMap } // namespace disassembly } // namespace codeobj +} // namespace sdk } // namespace rocprofiler diff --git a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/disassembly.hpp b/source/include/rocprofiler-sdk/cxx/codeobj/disassembly.hpp similarity index 99% rename from source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/disassembly.hpp rename to source/include/rocprofiler-sdk/cxx/codeobj/disassembly.hpp index 019cc7a7..ea7f9a63 100644 --- a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/disassembly.hpp +++ b/source/include/rocprofiler-sdk/cxx/codeobj/disassembly.hpp @@ -70,6 +70,8 @@ namespace rocprofiler { +namespace sdk +{ namespace codeobj { namespace disassembly @@ -333,4 +335,5 @@ class DisassemblyInstance } // namespace disassembly } // namespace codeobj +} // namespace sdk } // namespace rocprofiler diff --git a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/segment.hpp b/source/include/rocprofiler-sdk/cxx/codeobj/segment.hpp similarity index 98% rename from source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/segment.hpp rename to source/include/rocprofiler-sdk/cxx/codeobj/segment.hpp index bcc188f6..5a4ef6fd 100644 --- a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/segment.hpp +++ b/source/include/rocprofiler-sdk/cxx/codeobj/segment.hpp @@ -31,6 +31,8 @@ namespace rocprofiler { +namespace sdk +{ namespace codeobj { namespace segment @@ -89,4 +91,5 @@ class CodeobjTableTranslator : public std::set } // namespace segment } // namespace codeobj +} // namespace sdk } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-codeobj/tests/codeobj_library_test.cpp b/source/lib/rocprofiler-sdk-codeobj/tests/codeobj_library_test.cpp index fde0f6c8..463b5e91 100644 --- a/source/lib/rocprofiler-sdk-codeobj/tests/codeobj_library_test.cpp +++ b/source/lib/rocprofiler-sdk-codeobj/tests/codeobj_library_test.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -84,7 +84,7 @@ GetCodeobjContents() TEST(codeobj_library, segment_test) { - using CodeobjTableTranslator = rocprofiler::codeobj::segment::CodeobjTableTranslator; + using CodeobjTableTranslator = rocprofiler::sdk::codeobj::segment::CodeobjTableTranslator; CodeobjTableTranslator table; std::unordered_set used_addr{}; @@ -122,10 +122,10 @@ TEST(codeobj_library, segment_test) } } -namespace disassembly = rocprofiler::codeobj::disassembly; +namespace disassembly = rocprofiler::sdk::codeobj::disassembly; namespace codeobjhelper = rocprofiler::testing::codeobjhelper; -using CodeobjDecoderComponent = rocprofiler::codeobj::disassembly::CodeobjDecoderComponent; -using LoadedCodeobjDecoder = rocprofiler::codeobj::disassembly::LoadedCodeobjDecoder; +using CodeobjDecoderComponent = rocprofiler::sdk::codeobj::disassembly::CodeobjDecoderComponent; +using LoadedCodeobjDecoder = rocprofiler::sdk::codeobj::disassembly::LoadedCodeobjDecoder; TEST(codeobj_library, file_opens) { @@ -192,7 +192,7 @@ TEST(codeobj_library, loaded_codeobj_component) TEST(codeobj_library, codeobj_map_test) { - using marker_id_t = rocprofiler::codeobj::segment::marker_id_t; + using marker_id_t = rocprofiler::sdk::codeobj::segment::marker_id_t; const std::vector& objdata = rocprofiler::testing::codeobjhelper::GetCodeobjContents(); constexpr size_t laddr1 = 0x1000; @@ -221,7 +221,7 @@ TEST(codeobj_library, codeobj_map_test) TEST(codeobj_library, codeobj_table_test) { - using marker_id_t = rocprofiler::codeobj::segment::marker_id_t; + using marker_id_t = rocprofiler::sdk::codeobj::segment::marker_id_t; const std::vector& hiplines = codeobjhelper::GetHipccOutput(); const std::vector& objdata = codeobjhelper::GetCodeobjContents(); diff --git a/tests/pc_sampling/address_translation.hpp b/tests/pc_sampling/address_translation.hpp index 5e572f8c..30fc9b15 100644 --- a/tests/pc_sampling/address_translation.hpp +++ b/tests/pc_sampling/address_translation.hpp @@ -22,7 +22,7 @@ #pragma once -#include +#include #include #include @@ -38,9 +38,9 @@ namespace client { namespace address_translation { -using Instruction = rocprofiler::codeobj::disassembly::Instruction; -using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; -using marker_id_t = rocprofiler::codeobj::disassembly::marker_id_t; +using Instruction = rocprofiler::sdk::codeobj::disassembly::Instruction; +using CodeobjAddressTranslate = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; +using marker_id_t = rocprofiler::sdk::codeobj::disassembly::marker_id_t; /** * @brief Pair (code_object_id, pc_addr) uniquely identifies an instruction. diff --git a/tests/thread-trace/trace_callbacks.cpp b/tests/thread-trace/trace_callbacks.cpp index 97a319e3..71812e8e 100644 --- a/tests/thread-trace/trace_callbacks.cpp +++ b/tests/thread-trace/trace_callbacks.cpp @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include "common.hpp" #include @@ -50,8 +50,8 @@ namespace Callbacks { using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; -using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; -using Instruction = rocprofiler::codeobj::disassembly::Instruction; +using CodeobjAddressTranslate = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; +using Instruction = rocprofiler::sdk::codeobj::disassembly::Instruction; CodeobjAddressTranslate* codeobjTranslate = nullptr; From cfbac1964010c3f46f8bafcd55e694ae1d63926c Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 1 Aug 2024 02:59:35 -0500 Subject: [PATCH 6/7] Tracing Documentation (#997) * Update callback_services.md * Callback tracing services * Intercept table * Buffer tracing --- source/docs/buffered_services.md | 248 ++++++++++++++++++++++- source/docs/callback_services.md | 332 ++++++++++++++++++++++++++++++- source/docs/intercept_table.md | 95 ++++++++- 3 files changed, 668 insertions(+), 7 deletions(-) diff --git a/source/docs/buffered_services.md b/source/docs/buffered_services.md index dffea541..77d09027 100644 --- a/source/docs/buffered_services.md +++ b/source/docs/buffered_services.md @@ -2,12 +2,250 @@ For the buffered approach, supported buffer record categories are enumerated in `rocprofiler_buffer_category_t` category field. -## Buffered Tracing Services - ## Overview -In buffered approach, callbacks are receieved for batches of records from an internal (background) thread. Supported buffered tracing services are enumerated in `rocprofiler_buffer_tracing_kind_t`. +In buffered approach, callbacks are receieved for batches of records from an internal (background) thread. +Supported buffered tracing services are enumerated in `rocprofiler_buffer_tracing_kind_t`. Configuring +a buffer tracing service requires the creation of a buffer. When the buffer is "flushed", either implicitly +or explicitly, a callback to the tool will be invoked which provides an array of one or more buffer records. +A buffer can be explicitly flushed via the `rocprofiler_flush_buffer` function. + +## Subscribing to Buffer Tracing Services + +During tool initialization, tools configure callback tracing via the `rocprofiler_configure_buffer_tracing_service` +function. However, before invoking `rocprofiler_configure_buffer_tracing_service`, the tool must create a buffer +for the tracing records. + +### Creating a Buffer + +```cpp +rocprofiler_status_t +rocprofiler_create_buffer(rocprofiler_context_id_t context, + size_t size, + size_t watermark, + rocprofiler_buffer_policy_t policy, + rocprofiler_buffer_tracing_cb_t callback, + void* callback_data, + rocprofiler_buffer_id_t* buffer_id); +``` + +The `size` parameter is the size of the buffer in bytes and will be rounded up to the nearest +memory page size (defined by `sysconf(_SC_PAGESIZE)`); the default memory page size on Linux +is 4096 bytes (4 KB). + +The `watermark` parameter specifies the number of bytes at which +the buffer should be "flushed", i.e. when the records in the buffer should invoke the +`callback` parameter to deliver the records to the tool. For example, if a buffer has a size +of 4096 bytes and the watermark is set to 48 bytes, six 8-byte records can be placed in the +buffer before `callback` is invoked. However, every 64-byte record that is placed in the +buffer will trigger a flush. It is safe to set the `watermark` to any value between +zero and the buffer size. + +The `policy` parameter specifies the behavior for when a record is larger than the +amount of free space in the current buffer. For example, if a buffer has a size of +4000 bytes with a watermark set to 4000 bytes and 3998 of the bytes in the buffer +have been populated with records, the `policy` dictates how to handle an incoming record > +2 bytes. The `ROCPROFILER_BUFFER_POLICY_DISCARD` policy dictates that all records greater +than should 2 bytes should be dropped until the tool _explicitly_ flushes the buffer via +a `rocprofiler_flush_buffer` function call whereas the `ROCPROFILER_BUFFER_POLICY_LOSSLESS` +policy dictates that the current buffer should be swapped out for an empty buffer and placed +in that new buffer and former (full) buffer should be _implicitly_ flushed. + +The `callback` parameter is the function that rocprofiler-sdk should invoke when flushing +the buffer; the value of the `callback_data` parameter will be passed as one of the arguments +to the `callback` function. + +The `buffer_id` parameter is an output parameter for the function call and will have a +non-zero handle field after successful buffer creation. + +### Creating a Dedicated Thread for Buffer Callbacks + +By default, all buffers will use the same (default) background thread created by rocprofiler-sdk to +invoke their callback. However, rocprofiler-sdk provides an interface for tools to specify the +creation of an additional background thread for one or more of their buffers. + +Callback threads for buffers are created via the `rocprofiler_create_callback_thread` function: + +```cpp +rocprofiler_status_t +rocprofiler_create_callback_thread(rocprofiler_callback_thread_t* cb_thread_id); +``` + +Buffers are assigned to that callback thread via the `rocprofiler_assign_callback_thread` function: + +```cpp +rocprofiler_status_t +rocprofiler_assign_callback_thread(rocprofiler_buffer_id_t buffer_id, + rocprofiler_callback_thread_t cb_thread_id); +``` + +#### Buffer Callback Thread Creation and Assignment Example + +```cpp +{ + // create a context + auto context_id = rocprofiler_context_id_t{}; + rocprofiler_create_context(&context_id); + + // create a buffer associated with the context + auto buffer_id = rocprofiler_buffer_id_t{}; + rocprofiler_create_buffer(context_id, ..., &buffer_id); + + // specify that a new callback thread should be created and provide + // and assign the identifier for it to the "thr_id" variable + auto thr_id = rocprofiler_callback_thread_t{}; + rocprofiler_create_callback_thread(&thr_id); + + // assign the buffer callback to be delivered on this thread + rocprofiler_assign_callback_thread(buffer_id, thr_id); +} +``` + +### Configuring Buffer Tracing Services + +```cpp +rocprofiler_status_t +rocprofiler_configure_buffer_tracing_service(rocprofiler_context_id_t context_id, + rocprofiler_buffer_tracing_kind_t kind, + rocprofiler_tracing_operation_t* operations, + size_t operations_count, + rocprofiler_buffer_id_t buffer_id); +``` + +The `kind` parameter is a high-level specifier of which service to trace (also known as a "domain"). +Domain examples include, but are not limited to, the HIP API, the HSA API, and kernel dispatches. +For each domain, there are (often) various "operations", which can be used to restrict the callbacks +to a subset within the domain. For domains which correspond to APIs, the "operations" are the functions +which compose the API. If all operations in a domain should be traced, the `operations` and `operations_count` +parameters can be set to `nullptr` and `0`, respectively. If the tracing domain should be restricted to a subset +of operations, the tool library should specify a C-array of type `rocprofiler_tracing_operation_t` and the +size of the array for the `operations` and `operations_count` parameter. + +Similar to `rocprofiler_configure_callback_tracing_service`, +`rocprofiler_configure_buffer_tracing_service` will return an error if a buffer service for given context +and given domain is configured more than once. + +#### Example + +```cpp +{ + auto ctx = rocprofiler_context_id_t{}; + // ... creation of context, etc. ... + + // buffer parameters + constexpr auto KB = 1024; // 1024 bytes + constexpr auto buffer_size = 16 * KB; + constexpr auto watermark = 15 * KB; + constexpr auto policy = ROCPROFILER_BUFFER_POLICY_LOSSLESS; + + // buffer handle + auto buffer_id = rocprofiler_buffer_id_t{}; + + // create a buffer associated with the context + rocprofiler_create_buffer( + context_id, buffer_size, watermark, policy, callback_func, nullptr, &buffer_id); + + // configure HIP runtime API function records to be placed in buffer + rocprofiler_configure_buffer_tracing_service( + ctx, ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API, nullptr, 0, buffer_id); + + // configure kernel dispatch records to be placed in buffer + // (more than one service can use the same buffer) + rocprofiler_configure_buffer_tracing_service( + ctx, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, nullptr, 0, buffer_id); + + // ... etc. ... +} +``` + +## Buffer Tracing Callback Function + +Rocprofiler-sdk buffer tracing callback functions have the signature: + +```cpp +typedef void (*rocprofiler_buffer_tracing_cb_t)(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* data, + uint64_t drop_count); +``` + +The `rocprofiler_record_header_t` data type provides three pieces of information: + +1. Category (`rocprofiler_buffer_category_t`) +2. Kind +3. Payload + +The category is used to distinguish the classification of the buffer record. For all +services configured via `rocprofiler_configure_buffer_tracing_service`, the category will +be equal to the value of `ROCPROFILER_BUFFER_CATEGORY_TRACING`. The meaning of the kind +field is dependent on the category but when the category is `ROCPROFILER_BUFFER_CATEGORY_TRACING`, +the kind value will be equivalent to the is used +to distinguish the `rocprofiler_buffer_tracing_kind_t` value passed to +`rocprofiler_configure_buffer_tracing_service`, e.g. `ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH`. +Once the category and kind have been determined, the payload can be casted: + +```cpp +{ + if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API) + { + auto* record = + static_cast(header->payload); + + // ... etc. ... + } +} +``` + +### Buffer Tracing Callback Function Example + +```cpp +void +buffer_callback_func(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t drop_count) +{ + for(size_t i = 0; i < num_headers; ++i) + { + auto* header = headers[i]; + + if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API) + { + auto* record = + static_cast(header->payload); + + // ... etc. ... + } + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) + { + auto* record = + static_cast(header->payload); + + // ... etc. ... + } + else + { + throw std::runtime_error{"unhandled record header category + kind"}; + } + } +} +``` + +## Buffer Tracing Record -## HSA API Tracing +Unlike callback tracing records, there is no common set of data for each buffer tracing record. However, +many buffer tracing records contain a `kind` field and an `operation` field. +The name of a tracing kind can be obtained via the `rocprofiler_query_buffer_tracing_kind_name` function. +The name of an operation specific to a tracing kind can be obtained via the `rocprofiler_query_buffer_tracing_kind_operation_name` +function. One can also iterate over all the buffer tracing kinds and operations for each tracing kind via the +`rocprofiler_iterate_buffer_tracing_kinds` and `rocprofiler_iterate_buffer_tracing_kind_operations` functions. -## Kernel Tracing +The buffer tracing record data types can be found in the `rocprofiler-sdk/buffer_tracing.h` header +(`source/include/rocprofiler-sdk/buffer_tracing.h` in the [rocprofiler-sdk GitHub repository](https://github.com/ROCm/rocproifler-sdk)). diff --git a/source/docs/callback_services.md b/source/docs/callback_services.md index 4cb2c43a..6744d9d4 100644 --- a/source/docs/callback_services.md +++ b/source/docs/callback_services.md @@ -2,6 +2,336 @@ ## Overview +Callback tracing services provide immediate callbacks to a tool on the current CPU thread when a given event occurs. +For example, when tracing an API function, e.g. `hipSetDevice`, callback tracing invokes a user-specified callback +before and after the traced function executes on the thread which is invoking the API function. + +## Subscribing to Callback Tracing Services + +During tool initialization, tools configure callback tracing via the `rocprofiler_configure_callback_tracing_service` +function: + +```cpp +rocprofiler_status_t +rocprofiler_configure_callback_tracing_service(rocprofiler_context_id_t context_id, + rocprofiler_callback_tracing_kind_t kind, + rocprofiler_tracing_operation_t* operations, + size_t operations_count, + rocprofiler_callback_tracing_cb_t callback, + void* callback_args); +``` + +The `kind` parameter is a high-level specifier of which service to trace (also known as a "domain"). +Domain examples include, but are not limited to, the HIP API, the HSA API, and kernel dispatches. +For each domain, there are (often) various "operations", which can be used to restrict the callbacks +to a subset within the domain. For domains which correspond to APIs, the "operations" are the functions +which compose the API. If all operations in a domain should be traced, the `operations` and `operations_count` +parameters can be set to `nullptr` and `0`, respectively. If the tracing domain should be restricted to a subset +of operations, the tool library should specify a C-array of type `rocprofiler_tracing_operation_t` and the +size of the array for the `operations` and `operations_count` parameter. + +`rocprofiler_configure_callback_tracing_service` will return an error if a callback service for given context +and given domain is configured more than once. For example, if one only wanted to trace two functions within +the HIP runtime API, `hipGetDevice` and `hipSetDevice`, the following code would accomplish this objective: + +```cpp +{ + auto ctx = rocprofiler_context_id_t{}; + // ... creation of context, etc. ... + + // array of operations (i.e. API functions) + auto operations = std::array{ + ROCPROFILER_HIP_RUNTIME_API_ID_hipSetDevice, + ROCPROFILER_HIP_RUNTIME_API_ID_hipGetDevice + }; + + rocprofiler_configure_callback_tracing_service(ctx, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + operations.data(), + operations.size(), + callback_func, + nullptr); + // ... etc. ... +} +``` + +But the following code would be invalid: + +```cpp +{ + auto ctx = rocprofiler_context_id_t{}; + // ... creation of context, etc. ... + + // array of operations (i.e. API functions) + auto operations = std::array{ + ROCPROFILER_HIP_RUNTIME_API_ID_hipSetDevice, + ROCPROFILER_HIP_RUNTIME_API_ID_hipGetDevice + }; + + for(auto op : operations) + { + // after the first iteration, will return ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED + rocprofiler_configure_callback_tracing_service(ctx, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + &op, + 1, + callback_func, + nullptr); + } + + // ... etc. ... +} +``` + +## Callback Tracing Callback Function + +Rocprofiler-sdk callback tracing callback functions have the signature: + +```cpp +typedef void (*rocprofiler_callback_tracing_cb_t)(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) +``` + +The `record` parameter contains the information to uniquely identify a tracing record type and has the +following definition: + +```cpp +typedef struct rocprofiler_callback_tracing_record_t +{ + rocprofiler_context_id_t context_id; + rocprofiler_thread_id_t thread_id; + rocprofiler_correlation_id_t correlation_id; + rocprofiler_callback_tracing_kind_t kind; + uint32_t operation; + rocprofiler_callback_phase_t phase; + void* payload; +} rocprofiler_callback_tracing_record_t; +``` + +The underlying type of `payload` field above is typically unique to a domain and, less frequently, an operation. +For example, for the `ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API` and `ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API`, +the payload should be casted to `rocprofiler_callback_tracing_hip_api_data_t*` -- which will contain the arguments +to the function and (in the exit phase) the return value of the function. The payload field will only be a valid +pointer during the invocation of the callback function(s). + +The `user_data` parameter can be used to store data in between callback phases. It is a unique for every +instance of an operation. For example, if the tool library wishes to store the timestamp of the +`ROCPROFILER_CALLBACK_PHASE_ENTER` phase for the ensuing `ROCPROFILER_CALLBACK_PHASE_EXIT` callback, +this data can be stored in a method similar to below: + +```cpp +void +callback_func(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* cb_data) +{ + auto ts = rocprofiler_timestamp_t{}; + rocprofiler_get_timestamp(&ts); + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + user_data->value = ts; + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + { + auto delta_ts = (ts - user_data->value); + // ... etc. ... + } + else + { + // ... etc. ... + } +} +``` + +The `callback_data` argument will be the value of `callback_args` passed to `rocprofiler_configure_callback_tracing_service` +in [the previous section](#subscribing-to-callback-tracing-services). + +## Callback Tracing Record + +The name of a tracing kind can be obtained via the `rocprofiler_query_callback_tracing_kind_name` function. +The name of an operation specific to a tracing kind can be obtained via the `rocprofiler_query_callback_tracing_kind_operation_name` +function. One can also iterate over all the callback tracing kinds and operations for each tracing kind via the +`rocprofiler_iterate_callback_tracing_kinds` and `rocprofiler_iterate_callback_tracing_kind_operations` functions. +Lastly, for a given `rocprofiler_callback_tracing_record_t` object, rocprofiler-sdk supports generically iterating over +the arguments of the payload field for many domains. + +As mentioned above, within the `rocprofiler_callback_tracing_record_t` object, +an opaque `void* payload` is provided for accessing domain specific information. +The data types generally follow the naming convention of `rocprofiler_callback_tracing__data_t`, +e.g., for the tracing kinds `ROCPROFILER_BUFFER_TRACING_HSA_{CORE,AMD_EXT,IMAGE_EXT,FINALIZE_EXT}_API`, +the payload should be casted to `rocprofiler_callback_tracing_hsa_api_data_t*`: + +```cpp +void +callback_func(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* cb_data) +{ + static auto hsa_domains = std::unordered_set{ + ROCPROFILER_BUFFER_TRACING_HSA_CORE_API, + ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API, + ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API, + ROCPROFILER_BUFFER_TRACING_HSA_FINALIZER_API}; + + if(hsa_domains.count(record.kind) > 0) + { + auto* payload = static_cast(record.payload); + + hsa_status_t status = payload->retval.hsa_status_t_retval; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT && status != HSA_STATUS_SUCCESS) + { + const char* _kind = nullptr; + const char* _operation = nullptr; + + rocprofiler_query_callback_tracing_kind_name(record.kind, &_kind, nullptr); + rocprofiler_query_callback_tracing_kind_operation_name( + record.kind, record.operation, &_operation, nullptr); + + // message that + fprintf(stderr, "[domain=%s] %s returned a non-zero exit code: %i\n", _kind, _operation, status); + } + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + { + auto delta_ts = (ts - user_data->value); + // ... etc. ... + } + else + { + // ... etc. ... + } +} +``` + +### Sample `rocprofiler_iterate_callback_tracing_kind_operation_args` + +```cpp +int +print_args(rocprofiler_callback_tracing_kind_t domain_idx, + uint32_t op_idx, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t arg_indirection_count, + const char* arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t arg_dereference_count, + void* data) +{ + if(arg_num == 0) + { + const char* _kind = nullptr; + const char* _operation = nullptr; + + rocprofiler_query_callback_tracing_kind_name(domain_idx, &_kind, nullptr); + rocprofiler_query_callback_tracing_kind_operation_name( + domain_idx, op_idx, &_operation, nullptr); + + fprintf(stderr, "\n[%s] %s\n", _kind, _operation); + } + + char* _arg_type = abi::__cxa_demangle(arg_type, nullptr, nullptr, nullptr); + + fprintf(stderr, " %u: %-18s %-16s = %s\n", arg_num, _arg_type, arg_name, arg_value_str); + + free(_arg_type); + + // unused in example + (void) arg_value_addr; + (void) arg_indirection_count; + (void) arg_dereference_count; + (void) data; + + return 0; +} + +void +callback_func(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* cb_data) +{ + if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT && + record.kind == ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API && + (record.operation == ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchKernel || + record.operation == ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAsync)) + { + rocprofiler_iterate_callback_tracing_kind_operation_args( + record, print_args, record.phase, nullptr)); + } +} +``` + +Sample Output: + +```console + +[HIP_RUNTIME_API] hipLaunchKernel + 0: void const* function_address = 0x219308 + 1: rocprofiler_dim3_t numBlocks = {z=1, y=310, x=310} + 2: rocprofiler_dim3_t dimBlocks = {z=1, y=32, x=32} + 3: void** args = 0x7ffe6d8dd3c0 + 4: unsigned long sharedMemBytes = 0 + 5: ihipStream_t* stream = 0x17b40c0 + +[HIP_RUNTIME_API] hipMemcpyAsync + 0: void* dst = 0x7f06c7bbb010 + 1: void const* src = 0x7f0698800000 + 2: unsigned long sizeBytes = 393625600 + 3: hipMemcpyKind kind = DeviceToHost + 4: ihipStream_t* stream = 0x25dfcf0 +``` + ## Code Object Tracing -## HSA API Tracing +The code object tracing service is a critical component for obtaining information regarding +asynchronous activity on the GPU. The `rocprofiler_callback_tracing_code_object_load_data_t` +payload (kind=`ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT`, operation=`ROCPROFILER_CODE_OBJECT_LOAD`) +provides a unique identifier for a bundle of one or more GPU kernel symbols which have been loaded +for a specific GPU agent. For example, if your application is leveraging a multi-GPU system system +containing 4 Vega20 GPUs and 4 MI100 GPUs, there will at least 8 code objects loaded: one code +object for each GPU. Each code object will be associated with a set of kernel symbols: +the `rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t` payload +(kind=`ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT`, operation=`ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER`) +provides a globally unique identifier for the specific kernel symbol along with the kernel name and +several other static properties of the kernel (e.g. scratch size, scalar general purpose register count, etc.). +Note: two otherwise identical kernel symbols (same kernel name, scratch size, etc.) which are part of +otherwise identical code objects but the code objects are loaded for different GPU agents ***will*** have unique +kernel identifiers. Furthermore, if the same code object (and it's kernel symbols) are unloaded and then +re-loaded, that code object and all of it's kernel symbols ***will*** be given new unique identifiers. + +In general, when a code object is loaded and unloaded, here is the sequence of events: + +1. Callback: code object load + - kind=`ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT` + - operation=`ROCPROFILER_CODE_OBJECT_LOAD` + - phase=`ROCPROFILER_CALLBACK_PHASE_LOAD` +2. Callback: kernel symbol load + - kind=`ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT` + - operation=`ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER` + - phase=`ROCPROFILER_CALLBACK_PHASE_LOAD` + - Repeats for each kernel symbol in code object +3. Application Execution +4. Callback: kernel symbol unload + - kind=`ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT` + - operation=`ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER` + - phase=`ROCPROFILER_CALLBACK_PHASE_UNLOAD` + - Repeats for each kernel symbol in code object +5. Callback: code object unload + - kind=`ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT` + - operation=`ROCPROFILER_CODE_OBJECT_LOAD` + - phase=`ROCPROFILER_CALLBACK_PHASE_UNLOAD` + +Note: rocprofiler-sdk does not provide an interface to query this information outside of the +code object tracing service. If you wish to be able to associate kernel names with kernel tracing records, +a tool is personally responsible for making a copy of the relevant information when the code objects and +kernel symbol are loaded (however, any constant string fields like the (`const char* kernel_name` field) +need not to be copied, these are guaranteed to be valid pointers until after rocprofiler-sdk finalization). +If a tool decides to delete their copy of the data associated with a given code object or kernel symbol +identifier when the code object and kernel symbols are unloaded, it is highly recommended to flush +any/all buffers which might contain references to that code object or kernel symbol identifiers before +deleting the associated data. + +For a sample of code object tracing, please see the `samples/code_object_tracing` example in the +[rocprofiler-sdk GitHub repository](https://github.com/ROCm/rocproifler-sdk). diff --git a/source/docs/intercept_table.md b/source/docs/intercept_table.md index 7a7c4049..54a95093 100644 --- a/source/docs/intercept_table.md +++ b/source/docs/intercept_table.md @@ -1,3 +1,96 @@ # Runtime Intercept Tables -Discussion on how access the raw runtime intercept tables of HSA and HIP (i.e. ExaTracer requirements by LTTng). +Although most tools will want to leverage the callback or buffer tracing services for tracing the HIP, HSA, and ROCTx +APIs, rocprofiler-sdk does provide access to the raw API dispatch tables. Each of the aforementioned APIs are +designed similar to the following sample. + +## Dispatch Table Overview + +### Forward Declaration of public C API function + +```cpp +extern "C" +{ +// forward declaration of public C API function +int +foo(int) __attribute__((visibility("default"))); +} +``` + +### Internal Implementation of API function + +```cpp +namespace impl +{ +int +foo(int val) +{ + // real implementation + return (2 * val); +} +} +``` + +### Dispatch Table Implementation + +```cpp +namespace impl +{ +struct dispatch_table +{ + int (*foo_fn)(int) = nullptr; +}; + +// invoked once: populates the dispatch_table with function pointers to implementation +dispatch_table*& +construct_dispatch_table() +{ + static dispatch_table* tbl = new dispatch_table{}; + tbl->foo_fn = impl::foo; + + // in between above and below, rocprofiler-sdk gets passed the pointer + // to the dispatch table and has the opportunity to wrap the function + // pointers for interception + + return tbl; +} + +// constructs dispatch table and stores it in static variable +dispatch_table* +get_dispatch_table() +{ + static dispatch_table*& tbl = construct_dispatch_table(); + return tbl; +} +} // namespace impl +``` + +### Implementaiton of public C API function + +```cpp +extern "C" +{ +// implementation of public C API function +int +foo(int val) +{ + return impl::get_dispatch_table()->foo_fn(val); +} +} +``` + +### Dispatch Table Chaining + +rocprofiler-sdk is given an opportunity within `impl::construct_dispatch_table()` to +save the original value(s) of the function pointers such as `foo_fn` and install +it's own function pointers in its place -- this results in the public C API function `foo` +calling into the rocprofiler-sdk function pointer, which then in turn, calls the original +function pointer to `impl::foo` (this is called "chaining"). Once rocprofiler-sdk +has made any necessary modifications to the dispatch table, tools which indicated +they also want access to the raw dispatch table via `rocprofiler_at_intercept_table_registration` +will be passed the pointer to the dispatch table. + +## Sample + +For a demo of dispatch table chaining, please see the `samples/intercept_table` example in the +[rocprofiler-sdk GitHub repository](https://github.com/ROCm/rocproifler-sdk). From 69caa62b60dc65cfd80f14852701a34f53425c92 Mon Sep 17 00:00:00 2001 From: srawat <120587655+SwRaw@users.noreply.github.com> Date: Sat, 3 Aug 2024 00:38:04 +0530 Subject: [PATCH 7/7] rocprofv3 doc updates (#982) * updating rocprofv3 * using rocprofv3 * review updates * naming standardization * Update source/docs/how-to/using-rocprofv3.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * review comments * adding API references * kernel filtering * Remove Sphinx warn as error To bypass false warning for linking between rst and md * remove unused (duplicate) refs in _toc.yml.in --------- Co-authored-by: Gopesh Bhardwaj Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com> Co-authored-by: Peter Jun Park --- source/docs/_toc.yml.in | 17 +- .../{ => api-reference}/buffered_services.md | 2 +- .../{ => api-reference}/callback_services.md | 2 +- .../counter_collection_services.md | 2 +- .../{ => api-reference}/intercept_table.md | 2 +- .../docs/{ => api-reference}/pc_sampling.md | 2 +- .../tool_library.md} | 12 - .../comparing-with-legacy-tools.rst} | 28 +- source/docs/data/counter_collection.csv | 2 + source/docs/data/kernel_names.csv | 5 + source/docs/how-to/samples.md | 4 +- source/docs/how-to/using-rocprofv3.rst | 320 +++++++----------- source/docs/index.rst | 30 +- source/docs/install/installation.md | 8 +- source/scripts/update-docs.sh | 2 +- 15 files changed, 193 insertions(+), 245 deletions(-) rename source/docs/{ => api-reference}/buffered_services.md (99%) rename source/docs/{ => api-reference}/callback_services.md (99%) rename source/docs/{ => api-reference}/counter_collection_services.md (99%) rename source/docs/{ => api-reference}/intercept_table.md (98%) rename source/docs/{ => api-reference}/pc_sampling.md (98%) rename source/docs/{tool_library_overview.md => api-reference/tool_library.md} (98%) rename source/docs/{what-is-rocprof-sdk.rst => conceptual/comparing-with-legacy-tools.rst} (53%) create mode 100644 source/docs/data/counter_collection.csv create mode 100644 source/docs/data/kernel_names.csv diff --git a/source/docs/_toc.yml.in b/source/docs/_toc.yml.in index 8cc75d39..f6987bfe 100644 --- a/source/docs/_toc.yml.in +++ b/source/docs/_toc.yml.in @@ -6,14 +6,6 @@ defaults: root: index subtrees: - - entries: - - file: what-is-rocprof-sdk - - file: buffered_services.md - - file: callback_services.md - - file: counter_collection_services.md - - file: intercept_table.md - - file: pc_sampling.md - - file: tool_library_overview.md - caption: Install entries: - file: install/installation @@ -23,8 +15,17 @@ subtrees: - file: how-to/samples - caption: API reference entries: + - file: api-reference/buffered_services + - file: api-reference/callback_services + - file: api-reference/counter_collection_services + - file: api-reference/intercept_table + - file: api-reference/pc_sampling + - file: api-reference/tool_library - file: _doxygen/html/index title: API library + - caption: Conceptual + entries: + - file: conceptual/comparing-with-legacy-tools - caption: License entries: - file: license diff --git a/source/docs/buffered_services.md b/source/docs/api-reference/buffered_services.md similarity index 99% rename from source/docs/buffered_services.md rename to source/docs/api-reference/buffered_services.md index 77d09027..f6a7eead 100644 --- a/source/docs/buffered_services.md +++ b/source/docs/api-reference/buffered_services.md @@ -1,4 +1,4 @@ -# Buffered Services +# Buffered services For the buffered approach, supported buffer record categories are enumerated in `rocprofiler_buffer_category_t` category field. diff --git a/source/docs/callback_services.md b/source/docs/api-reference/callback_services.md similarity index 99% rename from source/docs/callback_services.md rename to source/docs/api-reference/callback_services.md index 6744d9d4..1a458490 100644 --- a/source/docs/callback_services.md +++ b/source/docs/api-reference/callback_services.md @@ -1,4 +1,4 @@ -# Callback Tracing Services +# Callback tracing services ## Overview diff --git a/source/docs/counter_collection_services.md b/source/docs/api-reference/counter_collection_services.md similarity index 99% rename from source/docs/counter_collection_services.md rename to source/docs/api-reference/counter_collection_services.md index 86cf29e8..a7f58b59 100644 --- a/source/docs/counter_collection_services.md +++ b/source/docs/api-reference/counter_collection_services.md @@ -1,4 +1,4 @@ -# Counter Collection Services +# Counter collection services ## Definitions diff --git a/source/docs/intercept_table.md b/source/docs/api-reference/intercept_table.md similarity index 98% rename from source/docs/intercept_table.md rename to source/docs/api-reference/intercept_table.md index 54a95093..58cdc745 100644 --- a/source/docs/intercept_table.md +++ b/source/docs/api-reference/intercept_table.md @@ -1,4 +1,4 @@ -# Runtime Intercept Tables +# Runtime intercept tables Although most tools will want to leverage the callback or buffer tracing services for tracing the HIP, HSA, and ROCTx APIs, rocprofiler-sdk does provide access to the raw API dispatch tables. Each of the aforementioned APIs are diff --git a/source/docs/pc_sampling.md b/source/docs/api-reference/pc_sampling.md similarity index 98% rename from source/docs/pc_sampling.md rename to source/docs/api-reference/pc_sampling.md index c7abfde5..a75cf03e 100644 --- a/source/docs/pc_sampling.md +++ b/source/docs/api-reference/pc_sampling.md @@ -1,4 +1,4 @@ -# PC Sampling Method +# PC sampling method PC Sampling is a profiling method that uses statistical approximation of the kernel execution by sampling GPU program counters. Furthermore, the method periodically chooses an active wave (in a round robin manner) and snapshot it's program counter (PC). The process takes place on every compute unit simultaneously which makes it device-wide PC sampling. The outcome is the histogram of samples that says how many times each kernel instruction was sampled. diff --git a/source/docs/tool_library_overview.md b/source/docs/api-reference/tool_library.md similarity index 98% rename from source/docs/tool_library_overview.md rename to source/docs/api-reference/tool_library.md index b8930e34..0d63f5a5 100644 --- a/source/docs/tool_library_overview.md +++ b/source/docs/api-reference/tool_library.md @@ -143,18 +143,6 @@ tool_init(rocprofiler_client_finalize_t fini_func, Otherwise, ROCprofiler-SDK invokes the `finalize` callback via an `atexit` handler. -## Agent Information - -## Contexts - -## Configuring Services - -## Synchronous Callbacks - -## Asynchronous Callbacks for Buffers - -## Recommendations - ## Full `rocprofiler_configure` Sample All of the snippets from the previous sections have been combined here for convenience. diff --git a/source/docs/what-is-rocprof-sdk.rst b/source/docs/conceptual/comparing-with-legacy-tools.rst similarity index 53% rename from source/docs/what-is-rocprof-sdk.rst rename to source/docs/conceptual/comparing-with-legacy-tools.rst index e4389d1d..82909b99 100644 --- a/source/docs/what-is-rocprof-sdk.rst +++ b/source/docs/conceptual/comparing-with-legacy-tools.rst @@ -1,22 +1,15 @@ .. meta:: - :description: Documentation of the installation, configuration, use of the ROCProfiler SDK, and rocprofv3 command-line tool - :keywords: ROCProfiler SDK tool, ROCProfiler SDK library, rocprofv3, ROCm, API, reference + :description: Documentation of the installation, configuration, use of the ROCprofiler-SDK, and rocprofv3 command-line tool + :keywords: ROCprofiler-SDK tool, ROCprofiler-SDK library, rocprofv3, ROCm, API, reference -.. _what-is-rocprof-sdk: +.. _comparing-with-legacy-tools: -========================== -What is ROCprofiler-SDK? -========================== +======================================================== +Comparing ROCprofiler-SDK to other ROCm profiling tools +======================================================== -ROCprofiler-SDK is a tooling infrastructure for profiling general-purpose GPU compute applications running on the ROCm software. -It supports application tracing to provide a big picture of the GPU application execution and kernel profiling to provide low-level hardware details from the performance counters. -The ROCprofiler-SDK library provides runtime-independent APIs for tracing runtime calls and asynchronous activities such as GPU kernel dispatches and memory moves. The tracing includes callback APIs for runtime API tracing and activity APIs for asynchronous activity records logging. - -In summary, ROCprofiler-SDK combines `ROCProfiler `_ and `ROCTracer `_. -You can utilize the ROCprofiler-SDK to develop a tool for profiling and tracing HIP applications on ROCm software. - -ROCprofiler-SDK is an improved version that enables more efficient implementations and better thread safety while avoiding problems that plague the former implementations of ROCProfiler and ROCTracer. -Here are the distinct ROCprofiler-SDK features: +ROCprofiler-SDK is an improved version of ROCm profiling tools that enables more efficient implementations and better thread safety while avoiding problems that plague the former implementations of ROCProfiler and ROCTracer. +Here are the distinct ROCprofiler-SDK features, which also highlight the improvements over ROCProfiler and ROCTracer: - Improved tool initialization - Support for simultaneous use of the same services by multiple tools @@ -25,10 +18,7 @@ Here are the distinct ROCprofiler-SDK features: - Backward ABI compatibility - PC sampling (beta implementation) -Improvements over ROCProfiler and ROCTracer ----------------------------------------------------- - -The former implementations allow a tool to access any of the services provided by ROCProfiler or ROCTracer such as API tracing, kernel tracing, etc., by calling ``roctracer_init()`` when a ROCm runtime is initially loaded. +The former implementations allow a tool to access any of the services provided by ROCProfiler or ROCTracer, such as API tracing and kernel tracing, by calling ``roctracer_init()`` when an ROCm runtime is initially loaded. As the calling tool is not required to specify during initialization, the services it needs to use, the libraries must be effectively prepared for any service to be available anytime. This behavior introduces unnecessary overhead and makes thread-safe data management difficult, as tools generally don't use all the available services. For example, ROCTracer always installs wrappers around every runtime API and adds indirection overhead through the ROCTracer library to check for the current service configuration in a thread-safe manner. diff --git a/source/docs/data/counter_collection.csv b/source/docs/data/counter_collection.csv new file mode 100644 index 00000000..b650bd02 --- /dev/null +++ b/source/docs/data/counter_collection.csv @@ -0,0 +1,2 @@ +"Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Name","Workgroup_Size","LDS_Block_Size","Scratch_Size","VGPR_Count","SGPR_Count","Counter_Name","Counter_Value" +0,1,1,139892123975680,5619,5619,1048576,"matrixTranspose(float*, float*, int)",16,0,0,8,16,"SQ_WAVES",65536 diff --git a/source/docs/data/kernel_names.csv b/source/docs/data/kernel_names.csv new file mode 100644 index 00000000..c0b571c2 --- /dev/null +++ b/source/docs/data/kernel_names.csv @@ -0,0 +1,5 @@ +"Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Name","Workgroup_Size","LDS_Block_Size","Scratch_Size","VGPR_Count","SGPR_Count","Counter_Name","Counter_Value" +4,4,1,1,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 +8,8,1,2,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 +12,12,1,3,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 +16,16,1,4,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 diff --git a/source/docs/how-to/samples.md b/source/docs/how-to/samples.md index f92fa314..3d6140cf 100644 --- a/source/docs/how-to/samples.md +++ b/source/docs/how-to/samples.md @@ -4,7 +4,7 @@ The samples are provided to help you see the profiler in action. ## Finding samples -After the ROCm build is installed: +The ROCm installation provides sample programs and `rocprofv3` tool. - Sample programs are installed here: @@ -35,7 +35,7 @@ ctest -V ``` :::{note} -Running a few of these tests require you to install Pandas and pytest first. +Running a few of these tests require you to install [pandas](https://pandas.pydata.org/) and [pytest](https://docs.pytest.org/en/stable/) first. ::: ```bash diff --git a/source/docs/how-to/using-rocprofv3.rst b/source/docs/how-to/using-rocprofv3.rst index a13f7ee3..c8c20753 100644 --- a/source/docs/how-to/using-rocprofv3.rst +++ b/source/docs/how-to/using-rocprofv3.rst @@ -1,6 +1,6 @@ .. meta:: - :description: Documentation of the installation, configuration, use of the ROCProfiler SDK, and rocprofv3 command-line tool - :keywords: ROCProfiler SDK tool, ROCProfiler SDK library, rocprofv3, ROCm, API, reference + :description: Documentation of the installation, configuration, use of the ROCprofiler-SDK, and rocprofv3 command-line tool + :keywords: ROCprofiler-SDK tool, ROCprofiler-SDK library, rocprofv3, ROCm, API, reference .. _using-rocprofv3: @@ -8,8 +8,8 @@ Using rocprofv3 ====================== -``rocprofv3`` is a CLI tool that helps you quickly optimize applications and understand the low-level kernel details without requiring any modification in the source code. -It is being developed to be backward compatible with its predecessor, ``rocprof``, and to provide more features for application profiling with better accuracy. +``rocprofv3`` is a CLI tool that helps you quickly optimize applications and understand the low-level kernel details without requiring any modification in the source code. +It's backward compatible with its predecessor, ``rocprof``, and provides more features for application profiling with better accuracy. The following sections demonstrate the use of ``rocprofv3`` for application tracing and kernel profiling using various command-line options. @@ -37,7 +37,7 @@ Here is the list of ``rocprofv3`` command-line options. Some options are used fo * - Option - Description - Use - + * - ``--hip-trace`` - Collects HIP runtime traces. - Application tracing @@ -113,7 +113,7 @@ Here is the list of ``rocprofv3`` command-line options. Some options are used fo * - ``-o`` \| ``--output-file`` - Specifies the name of the output file. Note that this name is appended to the default names (_api_trace or counter_collection.csv) of the generated files'. - Output control - + * - ``-M`` \| ``--mangled-kernels`` - Overrides the default demangling of kernel names. - Output control @@ -125,7 +125,7 @@ Here is the list of ``rocprofv3`` command-line options. Some options are used fo * - ``--output-format`` - For adding output format (supported formats: csv, json, pftrace) - Output control - + * - ``--preload`` - Libraries to prepend to LD_PRELOAD (usually for sanitizers) - Extension @@ -158,9 +158,6 @@ To trace HIP runtime APIs, use: rocprofv3 --hip-trace -- < app_relative_path > -.. note:: - The tracing and counter collection options generate an additional `agent info` file. - The above command generates a `hip_api_trace.csv` file prefixed with the process ID. .. code-block:: shell @@ -170,9 +167,9 @@ The above command generates a `hip_api_trace.csv` file prefixed with the process Here are the contents of `hip_api_trace.csv` file: .. csv-table:: HIP runtime api trace - :file: /data/hip_compile_trace.csv - :widths: 10,10,10,10,10,20,20 - :header-rows: 1 + :file: /data/hip_compile_trace.csv + :widths: 10,10,10,10,10,20,20 + :header-rows: 1 To trace HIP compile time APIs, use: @@ -189,23 +186,12 @@ The above command generates a `hip_api_trace.csv` file prefixed with the process Here are the contents of `hip_api_trace.csv` file: .. csv-table:: HIP compile time api trace - :file: /data/hip_compile_trace.csv - :widths: 10,10,10,10,10,20,20 - :header-rows: 1 + :file: /data/hip_compile_trace.csv + :widths: 10,10,10,10,10,20,20 + :header-rows: 1 For the description of the fields in the output file, see :ref:`output-file-fields`. -Agent Info -'''''''''''''' - -.. code-block:: shell - - $ cat 238_agent_info.csv - - "Node_Id","Logical_Node_Id","Agent_Type","Cpu_Cores_Count","Simd_Count","Cpu_Core_Id_Base","Simd_Id_Base","Max_Waves_Per_Simd","Lds_Size_In_Kb","Gds_Size_In_Kb","Num_Gws","Wave_Front_Size","Num_Xcc","Cu_Count","Array_Count","Num_Shader_Banks","Simd_Arrays_Per_Engine","Cu_Per_Simd_Array","Simd_Per_Cu","Max_Slots_Scratch_Cu","Gfx_Target_Version","Vendor_Id","Device_Id","Location_Id","Domain","Drm_Render_Minor","Num_Sdma_Engines","Num_Sdma_Xgmi_Engines","Num_Sdma_Queues_Per_Engine","Num_Cp_Queues","Max_Engine_Clk_Ccompute","Max_Engine_Clk_Fcompute","Sdma_Fw_Version","Fw_Version","Capability","Cu_Per_Engine","Max_Waves_Per_Cu","Family_Id","Workgroup_Max_Size","Grid_Max_Size","Local_Mem_Size","Hive_Id","Gpu_Id","Workgroup_Max_Dim_X","Workgroup_Max_Dim_Y","Workgroup_Max_Dim_Z","Grid_Max_Dim_X","Grid_Max_Dim_Y","Grid_Max_Dim_Z","Name","Vendor_Name","Product_Name","Model_Name" - 0,0,"CPU",24,0,0,0,0,0,0,0,0,1,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3800,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,"AMD Ryzen 9 3900X 12-Core Processor","CPU","AMD Ryzen 9 3900X 12-Core Processor","" - 1,1,"GPU",0,256,0,2147487744,10,64,0,64,64,1,64,4,4,1,16,4,32,90000,4098,26751,12032,0,128,2,0,2,24,3800,1630,432,440,138420864,16,40,141,1024,4294967295,0,0,64700,1024,1024,1024,4294967295,4294967295,4294967295,"gfx900","AMD","Radeon RX Vega","vega10" - HSA trace +++++++++++++ @@ -214,7 +200,7 @@ The HIP runtime library is implemented with the low-level HSA runtime. HSA API t HSA trace contains the start and end time of HSA runtime API calls and their asynchronous activities. .. code-block:: bash - + rocprofv3 --hsa-trace -- < app_relative_path > The above command generates a `hsa_api_trace.csv` file prefixed with process ID. Note that the contents of this file have been truncated for demonstration purposes. @@ -226,9 +212,9 @@ The above command generates a `hsa_api_trace.csv` file prefixed with process ID. Here are the contents of `hsa_api_trace.csv` file: .. csv-table:: HSA api trace - :file: /data/hsa_trace.csv - :widths: 10,10,10,10,10,20,20 - :header-rows: 1 + :file: /data/hsa_trace.csv + :widths: 10,10,10,10,10,20,20 + :header-rows: 1 For the description of the fields in the output file, see :ref:`output-file-fields`. @@ -284,9 +270,9 @@ Running the preceding command generates a `marker_api_trace.csv` file prefixed w Here are the contents of `marker_api_trace.csv` file: .. csv-table:: Marker api trace - :file: /data/marker_api_trace.csv - :widths: 10,10,10,10,10,20,20 - :header-rows: 1 + :file: /data/marker_api_trace.csv + :widths: 10,10,10,10,10,20,20 + :header-rows: 1 For the description of the fields in the output file, see :ref:`output-file-fields`. @@ -308,10 +294,10 @@ The above command generates a `kernel_trace.csv` file prefixed with the process Here are the contents of `kernel_trace.csv` file: .. csv-table:: Kernel trace - :file: /data/kernel_trace.csv - :widths: 10,10,10,10,10,10,20,20,10,10,10,10,10,10,10,10 + :file: /data/kernel_trace.csv + :widths: 10,10,10,10,10,10,20,20,10,10,10,10,10,10,10,10 :header-rows: 1 - + For the description of the fields in the output file, see :ref:`output-file-fields`. Memory copy trace @@ -332,8 +318,8 @@ The above command generates a `memory_copy_trace.csv` file prefixed with the pro Here are the contents of `memory_copy_trace.csv` file: .. csv-table:: Memory copy trace - :file: /data/memory_copy_trace.csv - :widths: 10,10,10,10,10,20,20 + :file: /data/memory_copy_trace.csv + :widths: 10,10,10,10,10,20,20 :header-rows: 1 For the description of the fields in the output file, see :ref:`output-file-fields`. @@ -377,10 +363,11 @@ The above command generates a `hip_stats.csv` and `hip_api_trace` file prefixed Here are the contents of `hip_stats.csv` file: .. csv-table:: HIP stats - :file: /data/hip_stats.csv - :widths: 10,10,20,20,10,10,10,10 + :file: /data/hip_stats.csv + :widths: 10,10,20,20,10,10,10,10 :header-rows: 1 +For the description of the fields in the output file, see :ref:`output-file-fields`. Kernel profiling ------------------- @@ -392,160 +379,141 @@ For a comprehensive list of counters available on MI200, see `MI200 performance Input file ++++++++++++ -Rocprofv3 supports three input file formats: text (.txt), yaml (.yaml/.yml), or JSON (.json) format. +To collect the desired basic counters or derived metrics, mention them in an input file. In the input file, the line consisting of the counter or metric names must begin with ``pmc``. The input file could be in text (.txt), yaml (.yaml/.yml), or JSON (.json) format. -Text input is used collect the desired basic counters or derived metrics. In the input file, the line consisting of the counter or metric names must begin with ``pmc``. -The input files in JSON/YAML support all commandline options. Using these files each run can be configured with different set of options. -The schema supported by input json and yaml is as given below: - -*Schema for the rocprofv3 JSON/YAML input* +.. code-block:: shell -Properties -++++++++++++ + $ cat input.txt -- **``jobs``** *(array)*: rocprofv3 input data per application run. - - - **Items** *(object)*: data for rocprofv3. - - - **``pmc``** *(array)*: list of counters to collect. - - **``kernel_include_regex``** *(string)*: regex string. - - **``kernel_exclude_regex``** *(string)*: regex string. - - **``kernel_iteration_range``** *(string)*: range for range for - each kernel that match the filter [start-stop]. - - **``hip_trace``** *(boolean)*: For Collecting HIP Traces - (runtime + compiler). - - **``hip_runtime_trace``** *(boolean)*: For Collecting HIP - Runtime API Traces. - - **``hip_compiler_trace``** *(boolean)*: For Collecting HIP - Compiler generated code Traces. - - **``marker_trace``** *(boolean)*: For Collecting Marker (ROCTx) - Traces. - - **``kernel_trace``** *(boolean)*: For Collecting Kernel - Dispatch Traces. - - **``memory_copy_trace``** *(boolean)*: For Collecting Memory - Copy Traces. - - **``scratch_memory_trace``** *(boolean)*: For Collecting - Scratch Memory operations Traces. - - **``stats``** *(boolean)*: For Collecting statistics of enabled - tracing types. - - **``hsa_trace``** *(boolean)*: For Collecting HSA Traces (core - + amd + image + finalizer). - - **``hsa_core_trace``** *(boolean)*: For Collecting HSA API - Traces (core API). - - **``hsa_amd_trace``** *(boolean)*: For Collecting HSA API - Traces (AMD-extension API). - - **``hsa_finalize_trace``** *(boolean)*: For Collecting HSA API - Traces (Finalizer-extension API). - - **``hsa_image_trace``** *(boolean)*: For Collecting HSA API - Traces (Image-extenson API). - - **``sys_trace``** *(boolean)*: For Collecting HIP, HSA, Marker - (ROCTx), Memory copy, Scratch memory, and Kernel dispatch - traces. - - **``mangled-kernels``** *(boolean)*: Do not demangle the kernel - names. - - **``truncate-kernels``** *(boolean)*: Truncate the demangled - kernel names. - - **``output_file``** *(string)*: For the output file name. - - **``output_directory``** *(string)*: For adding output path - where the output files will be saved. - - **``output_format``** *(array)*: For adding output format - (supported formats: csv, json, pftrace). - - **``list_metrics``** *(boolean)*: List the metrics. - - **``log_level``** *(string)*: fatal, error, warning, info, - trace. - - **``preload``** *(array)*: Libraries to prepend to LD_PRELOAD - (usually for sanitizers). - -The number of basic counters or derived metrics that can be collected in one run of profiling are limited by the GPU hardware resources. If too many counters or metrics are selected, the kernels need to be executed multiple times to collect them. -For multi-pass execution, in the input text file include multiple ``pmc`` rows and counters or metrics in each ``pmc`` row can be collected in each kernel run. Whereas Json/Yaml input files have a list of jobs and each job corresponds to a pass/run. + pmc: GPUBusy SQ_WAVES + pmc: GRBM_GUI_ACTIVE .. code-block:: shell $ cat input.json - { - "jobs": [ - { - "hsa_trace": true, - "kernel_trace": true, - "memory_copy_trace": true, - "marker_trace": true, - "output_file": "out", - "output_format": [ - "csv", - "json", - "pftrace" - ] - }, - { - "pmc": [ - "SQ_WAVES" - ], - "kernel_include_regex": ".*_kernel", - "kernel_exclude_regex": "multiply", - "kernel_iteration_range": "[1-2]", - "output_file": "out", - "output_format": [ - "csv", - "json" - ], - "truncate_kernels": true - } - ] - } + { + "metrics": [ + { + "pmc": ["SQ_WAVES", "GRBM_COUNT", "GUI_ACTIVE"] + }, + { + "pmc": ["FETCH_SIZE", "WRITE_SIZE"] + } + ] + } .. code-block:: shell - $ cat input.txt + $ cat input.yaml - pmc: GPUBusy SQ_WAVES - pmc: GRBM_GUI_ACTIVE + metrics: + - pmc: + - SQ_WAVES + - GRBM_COUNT + - GUI_ACTIVE + - 'TCC_HIT[1]' + - 'TCC_HIT[2]' + - pmc: + - FETCH_SIZE + - WRITE_SIZE + +The number of basic counters or derived metrics that can be collected in one run of profiling are limited by the GPU hardware resources. If too many counters or metrics are selected, the kernels need to be executed multiple times to collect them. For multi-pass execution, include multiple ``pmc`` rows in the input file. Counters or metrics in each ``pmc`` row can be collected in each kernel run. + +Kernel profiling output ++++++++++++++++++++++++++ + +To supply the input file for kernel profiling, use: .. code-block:: shell - $ cat input.yml + rocprofv3 -i input.txt -- - jobs: +Running the above command generates a `./pmc_n/counter_collection.csv` file prefixed with the process ID. For each ``pmc`` row, a directory ``pmc_n`` containing a `counter_collection.csv` file is generated, where n = 1 for the first row and so on. - - "hsa_trace": true - "kernel_trace": true - "memory_copy_trace": true - "marker_trace": true - "output_file": "out" - "output_format" - - "csv", - - "json", - - "pftrace" +Each row of the CSV file is an instance of kernel execution. Here is a truncated version of the output file from ``pmc_1``: - - pmc: - - SQ_WAVES - kernel_include_regex: "addition" - kernel_exclude_regex: "multiply" - kernel_iteration_range: - - "[1-2]" - - "[3-4]" - - "[5-6]" +.. code-block:: shell + $ cat pmc_1/218_counter_collection.csv -Kernel profiling output -+++++++++++++++++++++++++ +Here are the contents of `counter_collection.csv` file: -To supply the input file for kernel profiling, use: +.. csv-table:: Counter collection + :file: /data/counter_collection.csv + :widths: 10,10,10,10,10,10,10,10,10,10,10,10,10,10,10 + :header-rows: 1 + +For the description of the fields in the output file, see :ref:`output-file-fields`. + +Kernel names +++++++++++++++ + +To target a specific kernel for counter collection when multiple kernels are present, use the ``--kernel-names`` option: .. code-block:: shell - rocprofv3 -i input.txt -- + rocprofv3 -i input.txt --kernel-names divide_kernel -- Running the above command generates a `./pmc_n/counter_collection.csv` file prefixed with the process ID. For each ``pmc`` row, a directory ``pmc_n`` containing a `counter_collection.csv` file is generated, where n = 1 for the first row and so on. -Each row of the CSV file is an instance of kernel execution. Here is a truncated version of the output file from ``pmc_1``. +Each row of the CSV file is an instance of kernel execution. Here is a truncated version of the output file from ``pmc_1``: + +.. code-block:: shell + + $ cat pmc_1/312_counter_collection.csv + +Here are the contents of `counter_collection.csv` file: + +.. csv-table:: Targeted kernel counter collection + :file: /data/kernel_names.csv + :widths: 10,10,10,10,10,10,10,10,10,10,10,10,10,10,10 + :header-rows: 1 + +Agent info +++++++++++++ + +.. note:: + All tracing and counter collection options generate an additional `agent_info.csv` file prefixed with the process ID. +The `agent_info.csv` file contains information about the CPU or GPU the kernel runs on. + .. code-block:: shell - $ cat pmc_1/218_counter_collection.csv + $ cat 238_agent_info.csv + + "Node_Id","Logical_Node_Id","Agent_Type","Cpu_Cores_Count","Simd_Count","Cpu_Core_Id_Base","Simd_Id_Base","Max_Waves_Per_Simd","Lds_Size_In_Kb","Gds_Size_In_Kb","Num_Gws","Wave_Front_Size","Num_Xcc","Cu_Count","Array_Count","Num_Shader_Banks","Simd_Arrays_Per_Engine","Cu_Per_Simd_Array","Simd_Per_Cu","Max_Slots_Scratch_Cu","Gfx_Target_Version","Vendor_Id","Device_Id","Location_Id","Domain","Drm_Render_Minor","Num_Sdma_Engines","Num_Sdma_Xgmi_Engines","Num_Sdma_Queues_Per_Engine","Num_Cp_Queues","Max_Engine_Clk_Ccompute","Max_Engine_Clk_Fcompute","Sdma_Fw_Version","Fw_Version","Capability","Cu_Per_Engine","Max_Waves_Per_Cu","Family_Id","Workgroup_Max_Size","Grid_Max_Size","Local_Mem_Size","Hive_Id","Gpu_Id","Workgroup_Max_Dim_X","Workgroup_Max_Dim_Y","Workgroup_Max_Dim_Z","Grid_Max_Dim_X","Grid_Max_Dim_Y","Grid_Max_Dim_Z","Name","Vendor_Name","Product_Name","Model_Name" + 0,0,"CPU",24,0,0,0,0,0,0,0,0,1,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3800,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,"AMD Ryzen 9 3900X 12-Core Processor","CPU","AMD Ryzen 9 3900X 12-Core Processor","" + 1,1,"GPU",0,256,0,2147487744,10,64,0,64,64,1,64,4,4,1,16,4,32,90000,4098,26751,12032,0,128,2,0,2,24,3800,1630,432,440,138420864,16,40,141,1024,4294967295,0,0,64700,1024,1024,1024,4294967295,4294967295,4294967295,"gfx900","AMD","Radeon RX Vega","vega10" + +Kernel filtering ++++++++++++++++++ +Kernel filtering allows you to filter the kernel profiling output based on the kernel name by specifying regex strings in the input file. To include kernel names matching the regex string in the kernel profiling output, use ``kernel_include_regex``. To exclude the kernel names matching the regex string from the kernel profiling output, use ``kernel_exclude_regex``. +You can also specify an iteration range for set of iterations of the included kernels. If the iteration range is not specified, then all iterations of the included kernels are profiled. + +Here is an input file with kernel filters: + +.. code-block:: shell + + $ cat input.yml + jobs: + - pmc: [SQ_WAVES] + kernel_include_regex: "divide" + kernel_exclude_regex: "" + +To collect counters for the kernels matching the filters specified in the preceding input file, run: + +.. code-block:: shell + + rocprofv3 -i input.yml -- + + $ cat pass_1/312_counter_collection.csv "Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Name","Workgroup_Size","LDS_Block_Size","Scratch_Size","VGPR_Count","SGPR_Count","Counter_Name","Counter_Value" - 0,1,1,139892123975680,5619,5619,1048576,"matrixTranspose(float*, float*, int)",16,0,0,8,16,"SQ_WAVES",65536 + 4,4,1,1,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 + 8,8,1,2,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 + 12,12,1,3,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 + 16,16,1,4,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 .. _output-file-fields: @@ -605,32 +573,6 @@ The following table lists the various fields or the columns in the output CSV fi * - VGPR_Count - Kernel's Vector General Purpose Register (VGPR) count. -Kernel Filtering -+++++++++++++++++ - -rocprofv3 supports kernel filtering for profiling. A kernel filter is a set of a regex string (to include the kernels matching this filter), a regex string (to exclude the kernels matching this filter), -and an iteration range (set of iterations of the included kernels). If the iteration range is not provided then all iterations of the included kernels are profiled. - -.. code-block:: shell - - $ cat input.yml - jobs: - - pmc: [SQ_WAVES] - kernel_include_regex: "divide" - kernel_exclude_regex: "" - - -.. code-block:: shell - - rocprofv3 -i input.yml -- - - $ cat pass_1/312_counter_collection.csv - "Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Name","Workgroup_Size","LDS_Block_Size","Scratch_Size","VGPR_Count","SGPR_Count","Counter_Name","Counter_Value" - 4,4,1,1,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 - 8,8,1,2,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 - 12,12,1,3,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 - 16,16,1,4,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384 - Output formats ---------------- diff --git a/source/docs/index.rst b/source/docs/index.rst index d97efc90..8aaec858 100644 --- a/source/docs/index.rst +++ b/source/docs/index.rst @@ -1,16 +1,24 @@ .. meta:: - :description: Documentation of the installation, configuration, use of the ROCProfiler SDK, and rocprofv3 command-line tool - :keywords: ROCProfiler SDK tool, ROCProfiler SDK library, rocprofv3, ROCm, API, reference + :description: Documentation of the installation, configuration, use of the ROCprofiler SDK, and rocprofv3 command-line tool + :keywords: ROCprofiler-SDK tool, ROCprofiler-SDK library, rocprofv3, ROCm, API, reference .. _index: ****************************************** -ROCProfiler SDK documentation +ROCprofiler-SDK documentation ****************************************** -ROCProfiler SDK is a comprehensive library that provides APIs for profiling and tracing HIP applications on AMD ROCm Software. To learn more, see :ref:`what-is-rocprof-sdk` +ROCprofiler-SDK is a tooling infrastructure for profiling general-purpose GPU compute applications running on the ROCm software. +It supports application tracing to provide a big picture of the GPU application execution and kernel profiling to provide low-level hardware details from the performance counters. +The ROCprofiler-SDK library provides runtime-independent APIs for tracing runtime calls and asynchronous activities such as GPU kernel dispatches and memory moves. The tracing includes callback APIs for runtime API tracing and activity APIs for asynchronous activity records logging. -You can access ROCProfiler SDK on our `GitHub repository `_. +In summary, ROCprofiler-SDK combines `ROCProfiler `_ and `ROCTracer `_. +You can utilize the ROCprofiler-SDK to develop a tool for profiling and tracing HIP applications on ROCm software. + +The code is open and hosted at ``_. + +.. note:: + ROCprofiler-SDK is in beta and subject to change in future releases. The documentation is structured as follows: @@ -23,12 +31,22 @@ The documentation is structured as follows: .. grid-item-card:: How to - * :doc:`Using rocprofv3 ` + * :ref:`using-rocprofv3` * :doc:`Samples ` .. grid-item-card:: API reference + * :doc:`Buffered services ` + * :doc:`Callback services ` + * :doc:`Counter collection services ` + * :doc:`Intercept table ` + * :doc:`PC sampling ` + * :doc:`Tool library ` * :doc:`API library <_doxygen/html/index>` + + .. grid-item-card:: Conceptual + + * :ref:`comparing-with-legacy-tools` To contribute to the documentation, refer to `Contributing to ROCm `_. diff --git a/source/docs/install/installation.md b/source/docs/install/installation.md index 053e3851..f4150ae3 100644 --- a/source/docs/install/installation.md +++ b/source/docs/install/installation.md @@ -11,7 +11,7 @@ ROCprofiler-SDK is supported only on Linux. The following distributions are test - OpenSUSE 15.4 - RedHat 8.8 -Other [Linux distributions](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-operating-systems) might be supported but not tested yet. +ROCprofiler-SDK might operate as expected on other [Linux distributions](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-operating-systems), but has not been tested. ### Identifying the operating system @@ -31,9 +31,11 @@ The relevant fields are `ID` and the `VERSION_ID`. ## Build requirements -Install [CMake](https://cmake.org/) version 3.21 or higher. +Install [CMake](https://cmake.org/) version 3.21 (or later). -**Note:** If the `CMake` installed on the system is too old, you can install a new version using various methods. One of the easiest options is to use PyPi (Python’s pip). +:::{note} +If the `CMake` installed on the system is too old, you can install a new version using various methods. One of the easiest options is to use PyPi (Python’s pip). +::: ```bash pip install --user 'cmake==3.22.0' diff --git a/source/scripts/update-docs.sh b/source/scripts/update-docs.sh index 09357af3..c4528106 100755 --- a/source/scripts/update-docs.sh +++ b/source/scripts/update-docs.sh @@ -31,7 +31,7 @@ message "Running doxysphinx" doxysphinx build ${WORK_DIR} ${WORK_DIR}/_build/html ${WORK_DIR}/_doxygen/html message "Building html documentation" -make html SPHINXOPTS="-W --keep-going -n" +make html SPHINXOPTS="--keep-going -n" if [ -d ${SOURCE_DIR}/docs ]; then message "Removing stale documentation in ${SOURCE_DIR}/docs/"