Skip to content

Commit

Permalink
Add support for RCCL tracing (#1047)
Browse files Browse the repository at this point in the history
* [Draft]: Add support for RCCL tracing

Address comments

* [Draft]: Add support for RCCL tracing

Address PR comments, changes from RCCL upstream

* Add RCCL library table registration

Working on adding support to rocprofiler-register

* Support compilation w/o <rccl/amd_detail/api_trace.h>

- dummy api_trace.h header
- return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED when RCCL does not have api_trace.h header

* RCCL API tracing tool support

- add to rocprofv3
- add to json-tool

---------

Co-authored-by: Jonathan R. Madsen <[email protected]>
  • Loading branch information
MythreyaK and jrmadsen authored Sep 12, 2024
1 parent 72cbced commit 2a14625
Show file tree
Hide file tree
Showing 43 changed files with 2,270 additions and 30 deletions.
20 changes: 20 additions & 0 deletions cmake/rocprofiler_config_interfaces.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -295,3 +295,23 @@ target_link_libraries(rocprofiler-elfio INTERFACE elfio::elfio)
# ----------------------------------------------------------------------------------------#

target_link_libraries(rocprofiler-otf2 INTERFACE otf2::otf2)

# ----------------------------------------------------------------------------------------#
#
# RCCL
#
# ----------------------------------------------------------------------------------------#
# Locate the RCCL CMake config package. REQUIRED makes configuration fail when the
# package cannot be found; CONFIG restricts the search to package config files.
# The same two ROCm locations are supplied as both HINTS and PATHS, and
# lib/cmake/rccl is supplied as an additional path suffix for the search.
find_package(
rccl
REQUIRED
CONFIG
HINTS
${rocm_version_DIR}
${ROCM_PATH}
PATHS
${rocm_version_DIR}
${ROCM_PATH}
PATH_SUFFIXES
lib/cmake/rccl)

# NOTE(review): presumably configures rocprofiler-rccl-nolink with the usage
# requirements of rccl::rccl (e.g. include directories) without linking to the
# library - confirm against the definition of rocprofiler_config_nolink_target.
rocprofiler_config_nolink_target(rocprofiler-rccl-nolink rccl::rccl)
2 changes: 2 additions & 0 deletions cmake/rocprofiler_interfaces.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,5 @@ rocprofiler_add_interface_library(
rocprofiler_add_interface_library(
rocprofiler-hsakmt-nolink "rocprofiler-hsakmt without linking to HSAKMT library"
IMPORTED)
# Declares the rocprofiler-rccl-nolink interface library; the string is its
# human-readable description. NOTE(review): the IMPORTED keyword is handled by
# rocprofiler_add_interface_library, whose definition is not visible here.
rocprofiler_add_interface_library(rocprofiler-rccl-nolink
"RCCL headers without linking to RCCL library" IMPORTED)
12 changes: 10 additions & 2 deletions source/bin/rocprofv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,13 @@ def add_parser_bool_argument(gparser, *args, **kwargs):
aggregate_tracing_options,
"-r",
"--runtime-trace",
help="Collect tracing data for HIP runtime API, Marker (ROCTx) API, Memory operations (copies and scratch), and Kernel dispatches. Similar to --sys-trace but without tracing HIP compiler API and the underlying HSA API.",
help="Collect tracing data for HIP runtime API, Marker (ROCTx) API, RCCL API, Memory operations (copies and scratch), and Kernel dispatches. Similar to --sys-trace but without tracing HIP compiler API and the underlying HSA API.",
)
add_parser_bool_argument(
aggregate_tracing_options,
"-s",
"--sys-trace",
help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, Memory operations (copies and scratch), and Kernel dispatches.",
help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies and scratch), and Kernel dispatches.",
)

basic_tracing_options = parser.add_argument_group("Basic tracing options")
Expand Down Expand Up @@ -183,6 +183,11 @@ def add_parser_bool_argument(gparser, *args, **kwargs):
"--hsa-trace",
help="For collecting HSA Traces (core + amd + image + finalizer)",
)
# Register the --rccl-trace option in the "Basic tracing options" group.
# NOTE(review): presumably parsed as a boolean into args.rccl_trace, which is
# later exported via the "RCCL_API_TRACE" entry - confirm against
# add_parser_bool_argument.
add_parser_bool_argument(
basic_tracing_options,
"--rccl-trace",
help="For collecting RCCL Traces",
)

extended_tracing_options = parser.add_argument_group("Granular tracing options")

Expand Down Expand Up @@ -654,6 +659,7 @@ def _write_env_value():
"kernel_trace",
"memory_copy_trace",
"scratch_memory_trace",
"rccl_trace",
):
setattr(args, itr, True)

Expand All @@ -664,6 +670,7 @@ def _write_env_value():
"kernel_trace",
"memory_copy_trace",
"scratch_memory_trace",
"rccl_trace",
):
setattr(args, itr, True)

Expand All @@ -686,6 +693,7 @@ def _write_env_value():
["hsa_image_trace", "HSA_IMAGE_EXT_API_TRACE"],
["hsa_finalizer_trace", "HSA_FINALIZER_EXT_API_TRACE"],
["marker_trace", "MARKER_API_TRACE"],
["rccl_trace", "RCCL_API_TRACE"],
["kernel_trace", "KERNEL_TRACE"],
["memory_copy_trace", "MEMORY_COPY_TRACE"],
["scratch_memory_trace", "SCRATCH_MEMORY_TRACE"],
Expand Down
18 changes: 18 additions & 0 deletions source/include/rocprofiler-sdk/buffer_tracing.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,24 @@ typedef struct
/// ::rocprofiler_marker_name_api_id_t
} rocprofiler_buffer_tracing_marker_api_record_t;

/**
* @brief ROCProfiler Buffer RCCL API Record.
*/
typedef struct
{
uint64_t size; ///< size of this struct
rocprofiler_buffer_tracing_kind_t kind;
rocprofiler_tracing_operation_t operation;
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_thread_id_t thread_id; ///< id for thread generating this record

/// @var kind
/// @brief ::ROCPROFILER_BUFFER_TRACING_RCCL_API
/// @var operation
/// @brief Specification of the API function, e.g., ::rocprofiler_rccl_api_id_t
} rocprofiler_buffer_tracing_rccl_api_record_t;

/**
* @brief ROCProfiler Buffer Memory Copy Tracer Record.
*/
Expand Down
11 changes: 11 additions & 0 deletions source/include/rocprofiler-sdk/callback_tracing.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <rocprofiler-sdk/hip.h>
#include <rocprofiler-sdk/hsa.h>
#include <rocprofiler-sdk/marker.h>
#include <rocprofiler-sdk/rccl.h>

#include <hsa/hsa.h>
#include <hsa/hsa_amd_tool.h>
Expand Down Expand Up @@ -87,6 +88,16 @@ typedef struct
rocprofiler_marker_api_retval_t retval;
} rocprofiler_callback_tracing_marker_api_data_t;

/**
* @brief ROCProfiler RCCL API Callback Data.
*
* Payload type associated with RCCL API callback tracing: bundles the
* arguments and the return value of the traced RCCL function.
*/
typedef struct
{
uint64_t size; ///< size of this struct
rocprofiler_rccl_api_args_t args; ///< arguments of the RCCL API function
rocprofiler_rccl_api_retval_t retval; ///< return value of the RCCL API function
} rocprofiler_callback_tracing_rccl_api_data_t;

/**
* @brief ROCProfiler Code Object Load Tracer Callback Record.
*/
Expand Down
2 changes: 2 additions & 0 deletions source/include/rocprofiler-sdk/cxx/perfetto.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,15 @@ struct perfetto_category;
ROCPROFILER_DEFINE_CATEGORY(category, hsa_api, "HSA API function")
ROCPROFILER_DEFINE_CATEGORY(category, hip_api, "HIP API function")
ROCPROFILER_DEFINE_CATEGORY(category, marker_api, "Marker API region")
ROCPROFILER_DEFINE_CATEGORY(category, rccl_api, "RCCL API function")
ROCPROFILER_DEFINE_CATEGORY(category, kernel_dispatch, "GPU kernel dispatch")
ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy")

// Comma-separated list of every perfetto category declared above, each wrapped
// in ROCPROFILER_PERFETTO_CATEGORY. A category added via
// ROCPROFILER_DEFINE_CATEGORY must also be appended here to be part of the list.
#define ROCPROFILER_PERFETTO_CATEGORIES \
ROCPROFILER_PERFETTO_CATEGORY(category::hsa_api), \
ROCPROFILER_PERFETTO_CATEGORY(category::hip_api), \
ROCPROFILER_PERFETTO_CATEGORY(category::marker_api), \
ROCPROFILER_PERFETTO_CATEGORY(category::rccl_api), \
ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \
ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy)

Expand Down
23 changes: 23 additions & 0 deletions source/include/rocprofiler-sdk/cxx/serialization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,22 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_memory_copy_data_t data)
ROCP_SDK_SAVE_DATA_FIELD(bytes);
}

// Serializes an RCCL API return value into the archive.
// Only the ncclResult_t_retval member is written.
// NOTE(review): ROCP_SDK_SAVE_DATA_FIELD presumably writes data.<field> to `ar`
// under the field's name - confirm against the macro definition.
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_rccl_api_retval_t data)
{
ROCP_SDK_SAVE_DATA_FIELD(ncclResult_t_retval);
}

// Serializes RCCL API callback data into the archive.
// Writes the `size` and `retval` members, in that order.
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_callback_tracing_rccl_api_data_t data)
{
ROCP_SDK_SAVE_DATA_FIELD(size);
// The `args` member is currently not serialized (line intentionally disabled).
// NOTE(review): presumably no save() overload exists yet for
// rocprofiler_rccl_api_args_t - confirm before re-enabling.
// ROCP_SDK_SAVE_DATA_FIELD(args);
ROCP_SDK_SAVE_DATA_FIELD(retval);
}

template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_profile_counting_dispatch_data_t data)
Expand Down Expand Up @@ -405,6 +421,13 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_marker_api_record_t data)
save_buffer_tracing_api_record(ar, data);
}

// Serializes a buffered RCCL API tracing record by delegating to
// save_buffer_tracing_api_record, the same helper used for the marker API record.
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_buffer_tracing_rccl_api_record_t data)
{
save_buffer_tracing_api_record(ar, data);
}

template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_buffer_tracing_kernel_dispatch_record_t data)
Expand Down
1 change: 1 addition & 0 deletions source/include/rocprofiler-sdk/external_correlation.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ typedef enum // NOLINT(performance-enum-size)
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_SCRATCH_MEMORY,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_RCCL_API,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_LAST,
} rocprofiler_external_correlation_id_request_kind_t;

Expand Down
8 changes: 6 additions & 2 deletions source/include/rocprofiler-sdk/fwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ typedef enum // NOLINT(performance-enum-size)
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ///< @see ::rocprofiler_scratch_memory_operation_t
ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t
ROCPROFILER_CALLBACK_TRACING_RCCL_API, ///< Callbacks for RCCL API tracing
ROCPROFILER_CALLBACK_TRACING_LAST,
} rocprofiler_callback_tracing_kind_t;

Expand All @@ -193,6 +194,7 @@ typedef enum // NOLINT(performance-enum-size)
ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, ///< Buffer page migration info
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, ///< Buffer scratch memory reclamation info
ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, ///< Correlation ID is no longer in use
ROCPROFILER_BUFFER_TRACING_RCCL_API, ///< RCCL tracing
ROCPROFILER_BUFFER_TRACING_LAST,
} rocprofiler_buffer_tracing_kind_t;

Expand Down Expand Up @@ -350,7 +352,8 @@ typedef enum
ROCPROFILER_HSA_LIBRARY = (1 << 1),
ROCPROFILER_HIP_LIBRARY = (1 << 2),
ROCPROFILER_MARKER_LIBRARY = (1 << 3),
ROCPROFILER_LIBRARY_LAST = ROCPROFILER_MARKER_LIBRARY,
ROCPROFILER_RCCL_LIBRARY = (1 << 4),
ROCPROFILER_LIBRARY_LAST = ROCPROFILER_RCCL_LIBRARY,
} rocprofiler_runtime_library_t;

/**
Expand All @@ -365,7 +368,8 @@ typedef enum
ROCPROFILER_MARKER_CORE_TABLE = (1 << 3),
ROCPROFILER_MARKER_CONTROL_TABLE = (1 << 4),
ROCPROFILER_MARKER_NAME_TABLE = (1 << 5),
ROCPROFILER_TABLE_LAST = ROCPROFILER_MARKER_NAME_TABLE,
ROCPROFILER_RCCL_TABLE = (1 << 6),
ROCPROFILER_TABLE_LAST = ROCPROFILER_RCCL_TABLE,
} rocprofiler_intercept_table_t;

/**
Expand Down
27 changes: 27 additions & 0 deletions source/include/rocprofiler-sdk/rccl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#pragma once

#include <rocprofiler-sdk/rccl/api_args.h>
#include <rocprofiler-sdk/rccl/api_id.h>
#include <rocprofiler-sdk/rccl/table_id.h>
Loading

0 comments on commit 2a14625

Please sign in to comment.