From 94b5d9be3fe0fddc64262a92203e1c5b3f26ced7 Mon Sep 17 00:00:00 2001 From: SrirakshaNag <104580803+SrirakshaNag@users.noreply.github.com> Date: Wed, 31 Jul 2024 22:46:01 -0500 Subject: [PATCH] Adding changes for handling abort signals (#979) * Adding changes for handling abort signals * Fix the test failure * Fixing CmakeLists error * Addressing review comments * fixing warnings * fixing execute test * Fixing abort app test * Address review comments * Apply suggestions from code review * Apply suggestions from code review * Fixes for testing issues * Adding kernel filtering test * Removing text input file * fix formatting issues * misc fix * Suppress signal-unsafe error in ThreadSanitizer - rename signal handler to rocprofv3_error_signal_handler to ensure specific filtering * Fix rocprofv3 aborted-app validation --------- Co-authored-by: Jonathan R. Madsen Co-authored-by: Jonathan R. Madsen --- source/docs/how-to/using-rocprofv3.rst | 2 +- source/lib/rocprofiler-sdk-tool/tool.cpp | 31 +++++- source/scripts/thread-sanitizer-suppr.txt | 3 + tests/bin/vector-operations/vector-ops.cpp | 15 ++- tests/rocprofv3/CMakeLists.txt | 1 + tests/rocprofv3/aborted-app/CMakeLists.txt | 44 +++++++++ tests/rocprofv3/aborted-app/conftest.py | 23 +++++ tests/rocprofv3/aborted-app/input.json | 16 +++ tests/rocprofv3/aborted-app/pytest.ini | 5 + tests/rocprofv3/aborted-app/validate.py | 108 +++++++++++++++++++++ 10 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 tests/rocprofv3/aborted-app/CMakeLists.txt create mode 100644 tests/rocprofv3/aborted-app/conftest.py create mode 100644 tests/rocprofv3/aborted-app/input.json create mode 100644 tests/rocprofv3/aborted-app/pytest.ini create mode 100644 tests/rocprofv3/aborted-app/validate.py diff --git a/source/docs/how-to/using-rocprofv3.rst b/source/docs/how-to/using-rocprofv3.rst index 39252e89..a13f7ee3 100644 --- a/source/docs/how-to/using-rocprofv3.rst +++ b/source/docs/how-to/using-rocprofv3.rst @@ -608,7 +608,7 @@ The following table lists the various fields or the columns in the output CSV fi Kernel Filtering +++++++++++++++++ -rocprofv3 supports kernel filtering. A kernel filter is a set of a regex string (to include the kernels matching this filter), a regex string (to exclude the kernels matching this filter), +rocprofv3 supports kernel filtering for profiling. A kernel filter is a set of a regex string (to include the kernels matching this filter), a regex string (to exclude the kernels matching this filter), and an iteration range (set of iterations of the included kernels). If the iteration range is not provided then all iterations of the included kernels are profiled. .. code-block:: shell diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index aef5c231..dfc839ab 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -48,13 +48,16 @@ #include #include #include -#include #include #include #include + #include +#include #include +#include +#include #include #include #include @@ -1753,6 +1756,9 @@ get_main_function() static main_func_t user_main = nullptr; return user_main; } + +bool signal_handler_exit = + rocprofiler::tool::get_env("ROCPROF_INTERNAL_TEST_SIGNAL_HANDLER_VIA_EXIT", false); } // namespace #define ROCPROFV3_INTERNAL_API __attribute__((visibility("internal"))); @@ -1761,6 +1767,15 @@ extern "C" { void rocprofv3_set_main(main_func_t main_func) ROCPROFV3_INTERNAL_API; +void +rocprofv3_error_signal_handler(int signo) +{ + finalize_rocprofv3(); + // below is for testing purposes. re-raising the signal causes CTest to ignore WILL_FAIL ON + if(signal_handler_exit) ::exit(signo); + ::raise(signo); +} + int rocprofv3_main(int argc, char** argv, char** envp) ROCPROFV3_INTERNAL_API; @@ -1848,6 +1863,20 @@ rocprofv3_main(int argc, char** argv, char** envp) initialize_rocprofv3(); + struct sigaction sig_act = {}; + sigemptyset(&sig_act.sa_mask); + sig_act.sa_flags = SA_RESETHAND | SA_NODEFER; + sig_act.sa_handler = &rocprofv3_error_signal_handler; + for(auto signal_v : {SIGTERM, SIGSEGV, SIGINT, SIGILL, SIGABRT, SIGFPE}) + { + if(sigaction(signal_v, &sig_act, nullptr) != 0) + { + auto _errno_v = errno; + ROCP_ERROR << "error setting signal handler for " << signal_v + << " :: " << strerror(_errno_v); + } + } + auto ret = CHECK_NOTNULL(get_main_function())(argc, argv, envp); finalize_rocprofv3(); diff --git a/source/scripts/thread-sanitizer-suppr.txt b/source/scripts/thread-sanitizer-suppr.txt index 54d588d3..ea9a4bf2 100644 --- a/source/scripts/thread-sanitizer-suppr.txt +++ b/source/scripts/thread-sanitizer-suppr.txt @@ -27,3 +27,6 @@ mutex:external/ptl/source/PTL/TaskGroup.hh # lock order inversion that cannot happen mutex:source/lib/common/synchronized.hpp + +# signal-unsafe function called from signal handler +signal:rocprofv3_error_signal_handler diff --git a/tests/bin/vector-operations/vector-ops.cpp b/tests/bin/vector-operations/vector-ops.cpp index fdc76723..33ca7145 100644 --- a/tests/bin/vector-operations/vector-ops.cpp +++ b/tests/bin/vector-operations/vector-ops.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +218,12 @@ run(int NUM_QUEUE, int DEVICE_ID) HIP_API_CALL(hipGetLastError()); + if(getenv("ROCPROF_TESTING_RAISE_SIGNAL") != nullptr && + std::stoi(getenv("ROCPROF_TESTING_RAISE_SIGNAL")) > 0) + { + ::raise(SIGINT); + } + hipLaunchKernelGGL(multiply_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), @@ -268,13 +275,17 @@ run(int NUM_QUEUE, int DEVICE_ID) } int -main() +main(int argc, char** argv) { + int stream_count = 8; int device_count = 0; HIP_API_CALL(hipGetDeviceCount(&device_count)); + if(argc > 1) stream_count = std::stoi(argv[1]); + if(argc > 2) device_count = std::stoi(argv[2]); + for(int i = 0; i < device_count; ++i) - run(8, i); + run(stream_count, i); return 0; } diff --git a/tests/rocprofv3/CMakeLists.txt b/tests/rocprofv3/CMakeLists.txt index 17e6e8a1..334b84d1 100644 --- a/tests/rocprofv3/CMakeLists.txt +++ b/tests/rocprofv3/CMakeLists.txt @@ -29,3 +29,4 @@ add_subdirectory(tracing-hip-in-libraries) add_subdirectory(counter-collection) add_subdirectory(hsa-queue-dependency) add_subdirectory(kernel-rename) +add_subdirectory(aborted-app) diff --git a/tests/rocprofv3/aborted-app/CMakeLists.txt b/tests/rocprofv3/aborted-app/CMakeLists.txt new file mode 100644 index 00000000..8b5e2702 --- /dev/null +++ b/tests/rocprofv3/aborted-app/CMakeLists.txt @@ -0,0 +1,44 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-aborted-app + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + input.json) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(aborted-app-env "${PRELOAD_ENV}" ROCPROF_TESTING_RAISE_SIGNAL=1 + ROCPROF_INTERNAL_TEST_SIGNAL_HANDLER_VIA_EXIT=1) + +# app-abort +add_test( + NAME rocprofv3-test-execute-app-abort + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.json -d + ${CMAKE_CURRENT_BINARY_DIR}/out-aborted-app -- $ 1 1) + +set_tests_properties( + rocprofv3-test-execute-app-abort + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${aborted-app-env}" + WILL_FAIL TRUE) + +add_test( + NAME rocprofv3-test-validate-app-abort + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input + ${CMAKE_CURRENT_BINARY_DIR}/out-aborted-app/pass_1/out_results.json) + +set_tests_properties( + rocprofv3-test-validate-app-abort + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + "rocprofv3-test-execute-app-abort" FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/tests/rocprofv3/aborted-app/conftest.py b/tests/rocprofv3/aborted-app/conftest.py new file mode 100644 index 00000000..437dc547 --- /dev/null +++ b/tests/rocprofv3/aborted-app/conftest.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import json +import pytest +import pandas as pd + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list + + +def pytest_addoption(parser): + parser.addoption( + "--json-input", + action="store", + help="Path to JSON file.", + ) + + +@pytest.fixture +def json_data(request): + filename = request.config.getoption("--json-input") + with open(filename, "r") as inp: + return dotdict(collapse_dict_list(json.load(inp))) diff --git a/tests/rocprofv3/aborted-app/input.json b/tests/rocprofv3/aborted-app/input.json new file mode 100644 index 00000000..a01b2a40 --- /dev/null +++ b/tests/rocprofv3/aborted-app/input.json @@ -0,0 +1,16 @@ +{ + "jobs": [ + { + "pmc":["SQ_WAVES"], + "kernel_include_regex": "addition", + "kernel_exclude_regex": "subtract", + "hip_runtime_trace": true, + "kernel_trace": true, + "output_file": "out", + "output_format": [ + "json" + ], + "truncate_kernels": true + } + ] +} diff --git a/tests/rocprofv3/aborted-app/pytest.ini b/tests/rocprofv3/aborted-app/pytest.ini new file mode 100644 index 00000000..5e1e1c14 --- /dev/null +++ b/tests/rocprofv3/aborted-app/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/tests/rocprofv3/aborted-app/validate.py b/tests/rocprofv3/aborted-app/validate.py new file mode 100644 index 00000000..a4cfa2a4 --- /dev/null +++ b/tests/rocprofv3/aborted-app/validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import re + +kernel_trace_list = sorted(["addition_kernel", "subtract_kernel"]) +kernel_counter_list = ["addition_kernel"] + + +def unique(lst): + return list(set(lst)) + + +def test_counter_collection_json_data(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_kernel_name(kernel_id): + return data["kernel_symbols"][kernel_id]["formatted_kernel_name"] + + counter_collection_data = data["callback_records"]["counter_collection"] + + for counter in counter_collection_data: + kernel_name = get_kernel_name(counter.dispatch_data.dispatch_info.kernel_id) + assert kernel_name in kernel_counter_list + + +def test_kernel_trace_json(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_kernel_name(kernel_id): + return data["kernel_symbols"][kernel_id]["formatted_kernel_name"] + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + kernel_dispatch_data = data["buffer_records"]["kernel_dispatch"] + kernels = [] + assert len(kernel_dispatch_data) == 2 + for dispatch in kernel_dispatch_data: + dispatch_info = dispatch["dispatch_info"] + kernel_name = get_kernel_name(dispatch_info["kernel_id"]) + + assert get_kind_name(dispatch["kind"]) == "KERNEL_DISPATCH" + assert dispatch["correlation_id"]["internal"] > 0 + assert dispatch_info["agent_id"]["handle"] > 0 + assert dispatch_info["queue_id"]["handle"] > 0 + assert dispatch_info["kernel_id"] > 0 + if not re.search(r"__amd_rocclr_.*", kernel_name): + kernels.append(kernel_name) + + assert dispatch_info["workgroup_size"]["x"] == 64 + assert dispatch_info["workgroup_size"]["y"] == 1 + assert dispatch_info["workgroup_size"]["z"] == 1 + assert dispatch_info["grid_size"]["x"] == 1024 + assert dispatch_info["grid_size"]["y"] == 1024 + assert dispatch_info["grid_size"]["z"] == 1 + assert dispatch["end_timestamp"] >= dispatch["start_timestamp"] + + assert kernels == kernel_trace_list + + +def test_hip_api_trace_json(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_operation_name(kind_id, op_id): + return data["strings"]["buffer_records"][kind_id]["operations"][op_id] + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + valid_domain_names = ("HIP_RUNTIME_API",) + + hip_api_data = data["buffer_records"]["hip_api"] + + functions = [] + for api in hip_api_data: + kind = get_kind_name(api["kind"]) + assert kind in valid_domain_names + assert api["end_timestamp"] >= api["start_timestamp"] + functions.append(get_operation_name(api["kind"], api["operation"])) + + expected_functions = ( + [ + "hipGetDeviceCount", + "hipSetDevice", + "hipDeviceSynchronize", + "hipStreamCreateWithFlags", + ] + + (["hipHostMalloc"] * 3) + + (["hipMallocAsync"] * 3) + + (["hipMemcpyAsync"] * 2) + + [ + "hipStreamSynchronize", + "hipDeviceSynchronize", + "hipLaunchKernel", + "hipGetLastError", + "hipLaunchKernel", + "hipGetLastError", + ] + ) + + assert functions == expected_functions + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code)