Skip to content

Commit

Permalink
Adding changes for handling abort signals (#979)
Browse files Browse the repository at this point in the history
* Adding changes for handling abort signals

* Fix the test failure

* Fixing CmakeLists error

* Addressing review comments

* fixing warnings

* fixing execute test

* Fixing abort app test

* Address review comments

* Apply suggestions from code review

* Apply suggestions from code review

* Fixes for testing issues

* Adding kernel filtering test

* Removing text input file

* fix formatting issues

* misc fix

* Suppress signal-unsafe error in ThreadSanitizer

- rename signal handler to rocprofv3_error_signal_handler to ensure specific filtering

* Fix rocprofv3 aborted-app validation

---------

Co-authored-by: Jonathan R. Madsen <[email protected]>
Co-authored-by: Jonathan R. Madsen <[email protected]>
  • Loading branch information
3 people authored Aug 1, 2024
1 parent 018f3ce commit 94b5d9b
Show file tree
Hide file tree
Showing 10 changed files with 244 additions and 4 deletions.
2 changes: 1 addition & 1 deletion source/docs/how-to/using-rocprofv3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ The following table lists the various fields or the columns in the output CSV fi
Kernel Filtering
+++++++++++++++++

rocprofv3 supports kernel filtering. A kernel filter is a set of a regex string (to include the kernels matching this filter), a regex string (to exclude the kernels matching this filter),
rocprofv3 supports kernel filtering for profiling. A kernel filter is a set of a regex string (to include the kernels matching this filter), a regex string (to exclude the kernels matching this filter),
and an iteration range (set of iterations of the included kernels). If the iteration range is not provided then all iterations of the included kernels are profiled.

.. code-block:: shell
Expand Down
31 changes: 30 additions & 1 deletion source/lib/rocprofiler-sdk-tool/tool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,16 @@
#include <rocprofiler-sdk/internal_threading.h>
#include <rocprofiler-sdk/marker/api_id.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <algorithm>
#include <rocprofiler-sdk/cxx/hash.hpp>
#include <rocprofiler-sdk/cxx/operators.hpp>

#include <fmt/core.h>

#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <csignal>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <mutex>
Expand Down Expand Up @@ -1753,6 +1756,9 @@ get_main_function()
static main_func_t user_main = nullptr;
return user_main;
}

bool signal_handler_exit =
rocprofiler::tool::get_env("ROCPROF_INTERNAL_TEST_SIGNAL_HANDLER_VIA_EXIT", false);
} // namespace

#define ROCPROFV3_INTERNAL_API __attribute__((visibility("internal")));
Expand All @@ -1761,6 +1767,15 @@ extern "C" {
void
rocprofv3_set_main(main_func_t main_func) ROCPROFV3_INTERNAL_API;

void
rocprofv3_error_signal_handler(int signo)
{
finalize_rocprofv3();
// below is for testing purposes. re-raising the signal causes CTest to ignore WILL_FAIL ON
if(signal_handler_exit) ::exit(signo);
::raise(signo);
}

int
rocprofv3_main(int argc, char** argv, char** envp) ROCPROFV3_INTERNAL_API;

Expand Down Expand Up @@ -1848,6 +1863,20 @@ rocprofv3_main(int argc, char** argv, char** envp)

initialize_rocprofv3();

struct sigaction sig_act = {};
sigemptyset(&sig_act.sa_mask);
sig_act.sa_flags = SA_RESETHAND | SA_NODEFER;
sig_act.sa_handler = &rocprofv3_error_signal_handler;
for(auto signal_v : {SIGTERM, SIGSEGV, SIGINT, SIGILL, SIGABRT, SIGFPE})
{
if(sigaction(signal_v, &sig_act, nullptr) != 0)
{
auto _errno_v = errno;
ROCP_ERROR << "error setting signal handler for " << signal_v
<< " :: " << strerror(_errno_v);
}
}

auto ret = CHECK_NOTNULL(get_main_function())(argc, argv, envp);

finalize_rocprofv3();
Expand Down
3 changes: 3 additions & 0 deletions source/scripts/thread-sanitizer-suppr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@ mutex:external/ptl/source/PTL/TaskGroup.hh

# lock order inversion that cannot happen
mutex:source/lib/common/synchronized.hpp

# signal-unsafe function called from signal handler
signal:rocprofv3_error_signal_handler
15 changes: 13 additions & 2 deletions tests/bin/vector-operations/vector-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <stdlib.h>
#include <unistd.h>
#include <algorithm>
#include <csignal>
#include <iostream>
#include <mutex>
#include <vector>
Expand Down Expand Up @@ -217,6 +218,12 @@ run(int NUM_QUEUE, int DEVICE_ID)

HIP_API_CALL(hipGetLastError());

if(getenv("ROCPROF_TESTING_RAISE_SIGNAL") != nullptr &&
std::stoi(getenv("ROCPROF_TESTING_RAISE_SIGNAL")) > 0)
{
::raise(SIGINT);
}

hipLaunchKernelGGL(multiply_kernel,
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
Expand Down Expand Up @@ -268,13 +275,17 @@ run(int NUM_QUEUE, int DEVICE_ID)
}

int
main()
main(int argc, char** argv)
{
int stream_count = 8;
int device_count = 0;
HIP_API_CALL(hipGetDeviceCount(&device_count));

if(argc > 1) stream_count = std::stoi(argv[1]);
if(argc > 2) device_count = std::stoi(argv[2]);

for(int i = 0; i < device_count; ++i)
run(8, i);
run(stream_count, i);

return 0;
}
1 change: 1 addition & 0 deletions tests/rocprofv3/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ add_subdirectory(tracing-hip-in-libraries)
add_subdirectory(counter-collection)
add_subdirectory(hsa-queue-dependency)
add_subdirectory(kernel-rename)
add_subdirectory(aborted-app)
44 changes: 44 additions & 0 deletions tests/rocprofv3/aborted-app/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#
# rocprofv3 tool test
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)

project(
rocprofiler-tests-aborted-app
LANGUAGES CXX
VERSION 0.0.0)

find_package(rocprofiler-sdk REQUIRED)

rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py
input.json)

string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")

set(aborted-app-env "${PRELOAD_ENV}" ROCPROF_TESTING_RAISE_SIGNAL=1
ROCPROF_INTERNAL_TEST_SIGNAL_HANDLER_VIA_EXIT=1)

# app-abort
add_test(
NAME rocprofv3-test-execute-app-abort
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
${CMAKE_CURRENT_BINARY_DIR}/input.json -d
${CMAKE_CURRENT_BINARY_DIR}/out-aborted-app -- $<TARGET_FILE:vector-ops> 1 1)

set_tests_properties(
rocprofv3-test-execute-app-abort
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${aborted-app-env}"
WILL_FAIL TRUE)

add_test(
NAME rocprofv3-test-validate-app-abort
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input
${CMAKE_CURRENT_BINARY_DIR}/out-aborted-app/pass_1/out_results.json)

set_tests_properties(
rocprofv3-test-validate-app-abort
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
"rocprofv3-test-execute-app-abort" FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
23 changes: 23 additions & 0 deletions tests/rocprofv3/aborted-app/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3

import json
import pytest
import pandas as pd

from rocprofiler_sdk.pytest_utils.dotdict import dotdict
from rocprofiler_sdk.pytest_utils import collapse_dict_list


def pytest_addoption(parser):
parser.addoption(
"--json-input",
action="store",
help="Path to JSON file.",
)


@pytest.fixture
def json_data(request):
filename = request.config.getoption("--json-input")
with open(filename, "r") as inp:
return dotdict(collapse_dict_list(json.load(inp)))
16 changes: 16 additions & 0 deletions tests/rocprofv3/aborted-app/input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"jobs": [
{
"pmc":["SQ_WAVES"],
"kernel_include_regex": "addition",
"kernel_exclude_regex": "subtract",
"hip_runtime_trace": true,
"kernel_trace": true,
"output_file": "out",
"output_format": [
"json"
],
"truncate_kernels": true
}
]
}
5 changes: 5 additions & 0 deletions tests/rocprofv3/aborted-app/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

[pytest]
addopts = --durations=20 -rA -s -vv
testpaths = validate.py
pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
108 changes: 108 additions & 0 deletions tests/rocprofv3/aborted-app/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python3

import sys
import pytest
import re

kernel_trace_list = sorted(["addition_kernel", "subtract_kernel"])
kernel_counter_list = ["addition_kernel"]


def unique(lst):
return list(set(lst))


def test_counter_collection_json_data(json_data):
data = json_data["rocprofiler-sdk-tool"]

def get_kernel_name(kernel_id):
return data["kernel_symbols"][kernel_id]["formatted_kernel_name"]

counter_collection_data = data["callback_records"]["counter_collection"]

for counter in counter_collection_data:
kernel_name = get_kernel_name(counter.dispatch_data.dispatch_info.kernel_id)
assert kernel_name in kernel_counter_list


def test_kernel_trace_json(json_data):
data = json_data["rocprofiler-sdk-tool"]

def get_kernel_name(kernel_id):
return data["kernel_symbols"][kernel_id]["formatted_kernel_name"]

def get_kind_name(kind_id):
return data["strings"]["buffer_records"][kind_id]["kind"]

kernel_dispatch_data = data["buffer_records"]["kernel_dispatch"]
kernels = []
assert len(kernel_dispatch_data) == 2
for dispatch in kernel_dispatch_data:
dispatch_info = dispatch["dispatch_info"]
kernel_name = get_kernel_name(dispatch_info["kernel_id"])

assert get_kind_name(dispatch["kind"]) == "KERNEL_DISPATCH"
assert dispatch["correlation_id"]["internal"] > 0
assert dispatch_info["agent_id"]["handle"] > 0
assert dispatch_info["queue_id"]["handle"] > 0
assert dispatch_info["kernel_id"] > 0
if not re.search(r"__amd_rocclr_.*", kernel_name):
kernels.append(kernel_name)

assert dispatch_info["workgroup_size"]["x"] == 64
assert dispatch_info["workgroup_size"]["y"] == 1
assert dispatch_info["workgroup_size"]["z"] == 1
assert dispatch_info["grid_size"]["x"] == 1024
assert dispatch_info["grid_size"]["y"] == 1024
assert dispatch_info["grid_size"]["z"] == 1
assert dispatch["end_timestamp"] >= dispatch["start_timestamp"]

assert kernels == kernel_trace_list


def test_hip_api_trace_json(json_data):
data = json_data["rocprofiler-sdk-tool"]

def get_operation_name(kind_id, op_id):
return data["strings"]["buffer_records"][kind_id]["operations"][op_id]

def get_kind_name(kind_id):
return data["strings"]["buffer_records"][kind_id]["kind"]

valid_domain_names = ("HIP_RUNTIME_API",)

hip_api_data = data["buffer_records"]["hip_api"]

functions = []
for api in hip_api_data:
kind = get_kind_name(api["kind"])
assert kind in valid_domain_names
assert api["end_timestamp"] >= api["start_timestamp"]
functions.append(get_operation_name(api["kind"], api["operation"]))

expected_functions = (
[
"hipGetDeviceCount",
"hipSetDevice",
"hipDeviceSynchronize",
"hipStreamCreateWithFlags",
]
+ (["hipHostMalloc"] * 3)
+ (["hipMallocAsync"] * 3)
+ (["hipMemcpyAsync"] * 2)
+ [
"hipStreamSynchronize",
"hipDeviceSynchronize",
"hipLaunchKernel",
"hipGetLastError",
"hipLaunchKernel",
"hipGetLastError",
]
)

assert functions == expected_functions


if __name__ == "__main__":
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
sys.exit(exit_code)

0 comments on commit 94b5d9b

Please sign in to comment.