diff --git a/CMakeLists.txt b/CMakeLists.txt
index e6490fe367..036cbe87da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,8 +53,8 @@ endif (__GIT_EXECUTABLE)
 # This must be set because version tags
 set(HYDROGEN_VERSION_MAJOR 1)
-set(HYDROGEN_VERSION_MINOR 3)
-set(HYDROGEN_VERSION_PATCH 4)
+set(HYDROGEN_VERSION_MINOR 4)
+set(HYDROGEN_VERSION_PATCH 0)
 set(HYDROGEN_VERSION_MAJOR_MINOR
   "${HYDROGEN_VERSION_MAJOR}.${HYDROGEN_VERSION_MINOR}")
 set(HYDROGEN_VERSION
@@ -154,13 +154,17 @@ option(Hydrogen_ENABLE_CUDA
   "Search for CUDA support and enable related features if found."
   OFF)

-if (Hydrogen_ENABLE_CUDA)
+option(Hydrogen_ENABLE_ROCM
+  "Search for ROCm/HIP support and enable related features if found."
+  OFF)
+
+if (Hydrogen_ENABLE_CUDA OR Hydrogen_ENABLE_ROCM)
   option(Hydrogen_ENABLE_CUB
     "Search for CUB support and enable related features if found."
     ON)

-  option(Hydrogen_ENABLE_CUBLAS_TENSOR_MATH
-    "Use the cuBLAS tensor operation math."
+  option(Hydrogen_ENABLE_GPU_TENSOR_MATH
+    "Use the GPU tensor operations when available."
     OFF)

   option(Hydrogen_ENABLE_GPU_FP16
@@ -168,6 +172,14 @@ if (Hydrogen_ENABLE_CUDA)
     ON)
 endif ()

+if (Hydrogen_ENABLE_ROCM AND Hydrogen_ENABLE_CUDA)
+  message(FATAL_ERROR
+    "ROCm and CUDA code paths are mutually exclusive. "
+    "Please enable the one that corresponds to your hardware. "
+    "If you have mixed hardware, please contact the Hydrogen developers "
+    "as this would be of great interest.")
+endif ()
+
 #
 # MEMORY-RELATED OPTIONS
 #
@@ -334,8 +346,8 @@ if (Hydrogen_ENABLE_CUDA)
   find_package(CUDA REQUIRED) # Enable all the macros
   find_package(NVML REQUIRED)

-  if (Hydrogen_ENABLE_CUBLAS_TENSOR_MATH)
-    set(HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH TRUE)
+  if (Hydrogen_ENABLE_GPU_TENSOR_MATH)
+    set(HYDROGEN_GPU_USE_TENSOR_OP_MATH TRUE)
   endif ()

   if (Hydrogen_ENABLE_GPU_FP16)
@@ -387,38 +399,64 @@ if (Hydrogen_ENABLE_CUDA)
     set(HYDROGEN_HAVE_CUDA FALSE)
   endif ()
-
 endif (Hydrogen_ENABLE_CUDA)

-set(HYDROGEN_HAVE_GPU ${HYDROGEN_HAVE_CUDA})
+if (Hydrogen_ENABLE_ROCM)
+  set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
+  find_package(HIP REQUIRED)
+
+  if (Hydrogen_ENABLE_CUB)
+    set(CMAKE_PREFIX_PATH "/opt/rocm/hip" ${CMAKE_PREFIX_PATH})
+    set(HIP_FOUND FALSE)
+    find_package(HIP CONFIG REQUIRED)
+    find_package(rocPRIM REQUIRED)
+    find_package(hipCUB REQUIRED)
+    set(HYDROGEN_HAVE_CUB TRUE)
+  else ()
+    set(HYDROGEN_HAVE_CUB FALSE)
+  endif ()
+
+  if (HIP_FOUND)
+    set(CMAKE_CXX_EXTENSIONS FALSE)
+    find_package(ROCBLAS REQUIRED)
+    set(HYDROGEN_HAVE_ROCM TRUE)
+    message(STATUS "Found ROCm/HIP toolchain. Using HIP/ROCm.")
Using HIP/ROCm.") + else () + message(FATAL_ERROR "ROCm requested but not found.") + endif () +endif (Hydrogen_ENABLE_ROCM) + +if (HYDROGEN_HAVE_CUDA OR HYDROGEN_HAVE_ROCM) + set(HYDROGEN_HAVE_GPU TRUE) +endif () if (Hydrogen_ENABLE_ALUMINUM) - find_package(Aluminum 0.3.0 NO_MODULE + find_package(Aluminum 0.4.0 NO_MODULE HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR} PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum NO_DEFAULT_PATH) if (NOT Aluminum_FOUND) - find_package(Aluminum 0.3.0 NO_MODULE) + find_package(Aluminum 0.4.0 NO_MODULE) endif () if (Aluminum_FOUND) set(HYDROGEN_HAVE_ALUMINUM TRUE) message(STATUS "Found Aluminum: ${Aluminum_DIR}") - if (HYDROGEN_HAVE_CUDA AND AL_HAS_NCCL) + if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) set(HYDROGEN_HAVE_NCCL2 TRUE) message(STATUS "Aluminum detected with NCCL2 backend support.") else () set(HYDROGEN_HAVE_NCCL2 FALSE) - endif (HYDROGEN_HAVE_CUDA AND AL_HAS_NCCL) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) - if (HYDROGEN_HAVE_CUDA AND AL_HAS_MPI_CUDA) + if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE) message(STATUS "Aluminum detected with MPI-CUDA backend support.") else () set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE) - endif (HYDROGEN_HAVE_CUDA AND AL_HAS_MPI_CUDA) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) else () set(HYDROGEN_HAVE_ALUMINUM FALSE) set(HYDROGEN_HAVE_NCCL2 FALSE) @@ -497,7 +535,12 @@ configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/hydrogen_config.h.in configure_file("${PROJECT_SOURCE_DIR}/doxy/Doxyfile.in" "${PROJECT_BINARY_DIR}/doxy/Doxyfile") -add_library(Hydrogen_CXX "${HYDROGEN_SOURCES}" "${HYDROGEN_HEADERS}") +if (HYDROGEN_HAVE_ROCM) + hip_add_library(Hydrogen_CXX "${HYDROGEN_SOURCES}" "${HYDROGEN_HEADERS}") +else () + add_library(Hydrogen_CXX "${HYDROGEN_SOURCES}" "${HYDROGEN_HEADERS}") +endif () + target_include_directories(Hydrogen_CXX PUBLIC $ $ @@ -509,40 +552,60 @@ target_include_directories(Hydrogen_CXX PUBLIC # be forced to build with that (even though they maybe should)... 
 target_compile_options(Hydrogen_CXX PRIVATE ${EXTRA_CXX_FLAGS})

-target_link_libraries(Hydrogen_CXX PUBLIC ${Aluminum_LIBRARIES})
-target_link_libraries(Hydrogen_CXX PUBLIC ${HALF_LIBRARIES})
-
-if (TARGET OpenMP::OpenMP_CXX)
-  target_link_libraries(Hydrogen_CXX PUBLIC OpenMP::OpenMP_CXX)
-endif ()
-target_link_libraries(Hydrogen_CXX PUBLIC MPI::MPI_CXX)
-target_link_libraries(Hydrogen_CXX PUBLIC LAPACK::lapack)
-target_link_libraries(Hydrogen_CXX PUBLIC EP::extended_precision)
-
-target_link_libraries(Hydrogen_CXX PUBLIC ${VTUNE_LIBRARIES})
-target_link_libraries(Hydrogen_CXX PUBLIC ${NVTX_LIBRARIES})
-if (HYDROGEN_HAVE_CUDA)
-  target_link_libraries(Hydrogen_CXX PUBLIC cuda::toolkit)
-endif ()
+target_link_libraries(
+  Hydrogen_CXX PUBLIC
+  ${Aluminum_LIBRARIES}
+  ${HALF_LIBRARIES}
+  ${VTUNE_LIBRARIES}
+  ${NVTX_LIBRARIES}
+  ${ROCBLAS_LIBRARIES}
+  $
+  $
+  $
+  $
+  $
+  $
+  $
+  )

 # Add the CXX library to "Hydrogen"
 set(HYDROGEN_LIBRARIES Hydrogen_CXX)

 if (HYDROGEN_HAVE_CUDA)
-  add_library(Hydrogen_CUDA "${HYDROGEN_CUDA_SOURCES}")
+  add_library(Hydrogen_CUDA "${HYDROGEN_GPU_SOURCES}")
   target_include_directories(Hydrogen_CUDA PUBLIC
     $
     $
     $)

-  target_link_libraries(Hydrogen_CUDA PUBLIC ${HALF_LIBRARIES})
-  target_link_libraries(Hydrogen_CUDA PUBLIC ${NVTX_LIBRARIES})
-  target_link_libraries(Hydrogen_CUDA PUBLIC cuda::toolkit)
+  target_link_libraries(
+    Hydrogen_CUDA PUBLIC
+    ${HALF_LIBRARIES}
+    ${NVTX_LIBRARIES}
+    $
+    )

   target_link_libraries(Hydrogen_CXX PUBLIC Hydrogen_CUDA)
   list(APPEND HYDROGEN_LIBRARIES Hydrogen_CUDA)
 endif ()

+if (HYDROGEN_HAVE_ROCM)
+  hip_add_library(Hydrogen_ROCM STATIC "${HYDROGEN_GPU_SOURCES}")
+  target_include_directories(Hydrogen_ROCM PUBLIC
+    $
+    $
+    $
+    )
+
+  target_link_libraries(Hydrogen_ROCM PUBLIC
+    ${HALF_LIBRARIES}
+    ${ROCBLAS_LIBRARIES}
+    )
+
+  #set_target_properties(Hydrogen_ROCM PROPERTIES LINKER_LANGUAGE CXX)
+  list(APPEND HYDROGEN_LIBRARIES Hydrogen_ROCM)
+endif ()
+
 # Setup the tests
 if (Hydrogen_ENABLE_TESTING OR Hydrogen_ENABLE_UNIT_TESTS)
   include(CTest)
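The net effect of the build changes above: a configuration enables at most one vendor toolchain, and the generated config header exposes a vendor-neutral HYDROGEN_HAVE_GPU alongside the vendor-specific macros. A minimal sketch of how consuming C++ code can branch on these macros (illustrative only, not a file in this patch; it assumes only the macros declared in hydrogen_config.h.in below):

    #include "El/hydrogen_config.h"

    void device_specific_setup()
    {
    #if defined(HYDROGEN_HAVE_GPU)
        // Common GPU path, compiled for both CUDA and ROCm builds.
    #if defined(HYDROGEN_HAVE_CUDA)
        // CUDA-only details (cuBLAS handles, NVML, ...).
    #elif defined(HYDROGEN_HAVE_ROCM)
        // ROCm-only details (rocBLAS handles, ...).
    #endif
    #else
        // CPU-only build.
    #endif
    }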
diff --git a/cmake/configure_files/HydrogenConfig.cmake.in b/cmake/configure_files/HydrogenConfig.cmake.in
index 48656ae6e3..48de9a6b72 100644
--- a/cmake/configure_files/HydrogenConfig.cmake.in
+++ b/cmake/configure_files/HydrogenConfig.cmake.in
@@ -18,11 +18,14 @@ set(HYDROGEN_MPI_CXX_COMPILER "@MPI_CXX_COMPILER@")
 set(MPI_CXX_COMPILER "${HYDROGEN_MPI_CXX_COMPILER}"
   CACHE FILEPATH "The MPI CXX compiler.")

-set(_OpenMP_DIR "@OpenMP_DIR@")
-if (NOT OpenMP_DIR)
-  set(OpenMP_DIR "${_OpenMP_DIR}")
-endif ()
-include (FindAndVerifyOpenMP)
+set(_HYDROGEN_HAVE_OPENMP @EL_HAVE_OPENMP@)
+if (_HYDROGEN_HAVE_OPENMP)
+  set(_OpenMP_DIR "@OpenMP_DIR@")
+  if (NOT OpenMP_DIR)
+    set(OpenMP_DIR "${_OpenMP_DIR}")
+  endif ()
+  include (FindAndVerifyOpenMP)
+endif (_HYDROGEN_HAVE_OPENMP)

 # FIXME: I should do verification to make sure all found features are
 # the same.
 include (FindAndVerifyMPI)
@@ -33,14 +36,14 @@ set(_HYDROGEN_HAVE_NCCL2 @HYDROGEN_HAVE_NCCL2@)
 set(_HYDROGEN_HAVE_AL_MPI_CUDA @HYDROGEN_HAVE_AL_MPI_CUDA@)
 if (_HYDROGEN_HAVE_ALUMINUM)
   if (NOT Aluminum_FOUND)
-    find_package(Aluminum 0.3.0 NO_MODULE QUIET
+    find_package(Aluminum 0.4.0 NO_MODULE QUIET
       HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR}
       $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR}
       PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum
       NO_DEFAULT_PATH)
     if (NOT Aluminum_FOUND)
       set(Aluminum_DIR "@Aluminum_DIR@")
-      find_package(Aluminum 0.3.0 NO_MODULE REQUIRED)
+      find_package(Aluminum 0.4.0 NO_MODULE REQUIRED)
     endif ()
   endif ()
@@ -56,6 +59,16 @@ if (_HYDROGEN_HAVE_ALUMINUM)
   endif ()
 endif (_HYDROGEN_HAVE_ALUMINUM)

+# ROCm
+set(_HYDROGEN_HAVE_ROCM @HYDROGEN_HAVE_ROCM@)
+if (_HYDROGEN_HAVE_ROCM)
+  find_package(HIP REQUIRED)
+  find_package(ROCBLAS REQUIRED)
+
+  # query this beforehand, to set to what it was?
+  set(CMAKE_CXX_EXTENSIONS FALSE)
+endif (_HYDROGEN_HAVE_ROCM)
+
 # CUDA!
 set(_HYDROGEN_HAVE_CUDA @HYDROGEN_HAVE_CUDA@)
 set(_HYDROGEN_HAVE_CUB @HYDROGEN_HAVE_CUB@)
diff --git a/cmake/configure_files/HydrogenConfigVersion.cmake.in b/cmake/configure_files/HydrogenConfigVersion.cmake.in
index 30b7328ec2..4e7195d237 100644
--- a/cmake/configure_files/HydrogenConfigVersion.cmake.in
+++ b/cmake/configure_files/HydrogenConfigVersion.cmake.in
@@ -7,13 +7,15 @@
 # [0.87 1.0.0)
 # [1.0.0 1.1.0)
 # [1.1.0 1.2.0)
-# [1.2.0 ???)
+# [1.2.0 1.3.0)
+# [1.3.0 1.4.0)
+# [1.4.0 ???)
 #
 # IMPORTANT: IF YOU MAKE A BREAKING CHANGE TO HYDROGEN, THE UPDATE
 # MUST BE GIVEN A NEW VERSION NUMBER, WHICH THEN MUST BE APPENDED TO
 # THIS LIST.

-set(_version_compat_ranges 0.0.0 0.87.0 1.0.0 1.1.0 1.2.0)
+set(_version_compat_ranges 0.0.0 0.87.0 1.0.0 1.1.0 1.2.0 1.3.0 1.4.0)

 # This is the version that has been installed.
 set(PACKAGE_VERSION "@HYDROGEN_VERSION@")
diff --git a/cmake/configure_files/hydrogen_config.h.in b/cmake/configure_files/hydrogen_config.h.in
index 5f43053d70..e7c4b0b8bc 100644
--- a/cmake/configure_files/hydrogen_config.h.in
+++ b/cmake/configure_files/hydrogen_config.h.in
@@ -33,16 +33,18 @@
 #cmakedefine HYDROGEN_HAVE_MKL
 #cmakedefine HYDROGEN_HAVE_MKL_GEMMT

+#cmakedefine HYDROGEN_HAVE_GPU
+
 // CUDA stuff
 #cmakedefine HYDROGEN_HAVE_CUDA
-#cmakedefine HYDROGEN_HAVE_CUB
-#cmakedefine HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH
+
+// ROCm stuff
+#cmakedefine HYDROGEN_HAVE_ROCM

 // General GPU stuff
-#ifdef HYDROGEN_HAVE_CUDA
-#define HYDROGEN_HAVE_GPU
+#cmakedefine HYDROGEN_HAVE_CUB
+#cmakedefine HYDROGEN_GPU_USE_TENSOR_OP_MATH
 #cmakedefine HYDROGEN_GPU_USE_FP16
-#endif // HYDROGEN_HAVE_CUDA

 // Aluminum stuff
 #cmakedefine HYDROGEN_HAVE_ALUMINUM
@@ -62,4 +64,7 @@

 #cmakedefine HYDROGEN_DO_BOUNDS_CHECKING

+#define H_RESTRICT __restrict__
+#define H_PRETTY_FUNCTION __PRETTY_FUNCTION__
+
 #endif /* HYDROGEN_CONFIG_H */
diff --git a/cmake/modules/FindROCBLAS.cmake b/cmake/modules/FindROCBLAS.cmake
new file mode 100644
index 0000000000..a4e939347d
--- /dev/null
+++ b/cmake/modules/FindROCBLAS.cmake
@@ -0,0 +1,46 @@
+# Find rocBLAS library and supporting header
+#
+# rocBLAS_DIR or ROCBLAS_DIR[in]: The prefix for rocBLAS
+#
+# ROCBLAS_INCLUDE_PATH[out,cache]: The include path for rocBLAS
+# ROCBLAS_LIBRARY[out,cache]: The rocBLAS library
+#
+# ROCBLAS_LIBRARIES[out]: The thing to link to for rocBLAS
+# ROCBLAS_FOUND[out]: Variable indicating whether rocBLAS has been found
+#
+# rocblas::rocblas: Imported library for rocBLAS
+#
+
+find_path(ROCBLAS_INCLUDE_PATH rocblas.h
+  HINTS ${rocBLAS_DIR} $ENV{rocBLAS_DIR} ${ROCBLAS_DIR} $ENV{ROCBLAS_DIR}
+  PATH_SUFFIXES include
+  NO_DEFAULT_PATH
+  DOC "The rocBLAS include path.")
+find_path(ROCBLAS_INCLUDE_PATH rocblas.h)
+
+find_library(ROCBLAS_LIBRARY rocblas
+  HINTS ${rocBLAS_DIR} $ENV{rocBLAS_DIR} ${ROCBLAS_DIR} $ENV{ROCBLAS_DIR}
+  PATH_SUFFIXES lib64 lib
+  NO_DEFAULT_PATH
+  DOC "The rocBLAS library.")
+find_library(ROCBLAS_LIBRARY rocblas)
+
+# Standard handling of the package arguments
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ROCBLAS
+  REQUIRED_VARS ROCBLAS_LIBRARY ROCBLAS_INCLUDE_PATH)
+
+if (NOT TARGET rocblas::rocblas)
+  add_library(rocblas::rocblas INTERFACE IMPORTED)
+endif ()
+
+if (ROCBLAS_INCLUDE_PATH AND ROCBLAS_LIBRARY)
+  set_target_properties(rocblas::rocblas PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES
+    "${ROCBLAS_INCLUDE_PATH};/opt/rocm/hsa/include;/opt/rocm/hip/include"
+    INTERFACE_LINK_LIBRARIES "${ROCBLAS_LIBRARY}")
+endif ()
+
+set(ROCBLAS_LIBRARIES rocblas::rocblas)
+mark_as_advanced(ROCBLAS_INCLUDE_PATH)
+mark_as_advanced(ROCBLAS_LIBRARY)
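hydrogen_config.h.in now also defines the portability macros H_RESTRICT and H_PRETTY_FUNCTION. A minimal sketch of the intended use of H_RESTRICT; axpy_sketch is a hypothetical function, not part of this patch:

    #include "El/hydrogen_config.h"

    void axpy_sketch(int n, double alpha,
                     double const* H_RESTRICT x,
                     double* H_RESTRICT y)
    {
        // H_RESTRICT expands to __restrict__, promising the compiler that
        // x and y do not alias, which enables vectorization.
        for (int i = 0; i < n; ++i)
            y[i] += alpha * x[i];
    }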
diff --git a/include/El/blas_like/level1/AllReduce.hpp b/include/El/blas_like/level1/AllReduce.hpp
index ddace1cca3..4b08c58e9a 100644
--- a/include/El/blas_like/level1/AllReduce.hpp
+++ b/include/El/blas_like/level1/AllReduce.hpp
@@ -61,11 +61,11 @@ void AllReduce(AbstractMatrix<T>& A, mpi::Comm const& comm, mpi::Op op)
     case Device::CPU:
         AllReduce(static_cast<Matrix<T, Device::CPU>&>(A), comm, op);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         AllReduce(static_cast<Matrix<T, Device::GPU>&>(A), comm, op);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("AllReduce: Bad device!");
     }
diff --git a/include/El/blas_like/level1/Axpy.hpp b/include/El/blas_like/level1/Axpy.hpp
index efe1d039bf..3278b61a04 100644
--- a/include/El/blas_like/level1/Axpy.hpp
+++ b/include/El/blas_like/level1/Axpy.hpp
@@ -26,13 +26,13 @@ void Axpy(S alphaS, AbstractMatrix<T> const& X, AbstractMatrix<T>& Y)
             static_cast<Matrix<T, Device::CPU> const&>(X),
             static_cast<Matrix<T, Device::CPU>&>(Y));
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Axpy(alphaS,
             static_cast<Matrix<T, Device::GPU> const&>(X),
             static_cast<Matrix<T, Device::GPU>&>(Y));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Axpy: Bad device.");
     }
@@ -95,7 +95,7 @@ void Axpy(S alphaS, const Matrix<T>& X, Matrix<T>& Y)
     }
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template >, typename=void>
@@ -146,7 +146,7 @@ void Axpy(S alphaS, Matrix<T, Device::GPU> const& X, Matrix<T, Device::GPU>& Y)
             mX, nX, alpha, XBuf, ldX, YBuf, ldY, syncInfoY);
     }
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T, typename S>
 void Axpy(S alphaS, const ElementalMatrix<T>& X, ElementalMatrix<T>& Y)
diff --git a/include/El/blas_like/level1/Axpy/util.hpp b/include/El/blas_like/level1/Axpy/util.hpp
index 190550a0f5..c4ebedabfc 100644
--- a/include/El/blas_like/level1/Axpy/util.hpp
+++ b/include/El/blas_like/level1/Axpy/util.hpp
@@ -9,7 +9,7 @@
 #ifndef EL_BLAS_AXPY_UTIL_HPP
 #define EL_BLAS_AXPY_UTIL_HPP

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 #include
 #endif
@@ -35,7 +35,7 @@ void InterleaveMatrixUpdate(
                 &B[rowStrideB*j], colStrideB);
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 void InterleaveMatrixUpdate(
     T alpha, Int height, Int width,
@@ -47,9 +47,9 @@ void InterleaveMatrixUpdate(
     hydrogen::Axpy_GPU_impl(height, width, alpha,
                             A, colStrideA, rowStrideA,
                             B, colStrideB, rowStrideB,
-                            syncInfo.stream_);
+                            syncInfo);
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template
 void UpdateWithLocalData(
diff --git a/include/El/blas_like/level1/AxpyContract.hpp b/include/El/blas_like/level1/AxpyContract.hpp
index bb47cf1075..a6ca6d7453 100644
--- a/include/El/blas_like/level1/AxpyContract.hpp
+++ b/include/El/blas_like/level1/AxpyContract.hpp
@@ -533,11 +533,11 @@ void AxpyContract
     case Device::CPU:
         AxpyContract_impl(alpha,A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         AxpyContract_impl(alpha,A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("AxpyContract: Bad device type.");
     }
diff --git a/include/El/blas_like/level1/Broadcast.hpp b/include/El/blas_like/level1/Broadcast.hpp
index eb3409ebb7..caff9d99ba 100644
--- a/include/El/blas_like/level1/Broadcast.hpp
+++ b/include/El/blas_like/level1/Broadcast.hpp
@@ -60,12 +60,12 @@ void Broadcast( AbstractMatrix<T>& A, mpi::Comm const& comm, int rank )
         Broadcast_impl(static_cast<Matrix<T, Device::CPU>&>(A),
                        std::move(comm), rank);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Broadcast_impl(static_cast<Matrix<T, Device::GPU>&>(A),
                        std::move(comm), rank);
         break;
-#endif // HYROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Unsupported device type.");
     }
@@ -125,11 +125,11 @@ void Broadcast( AbstractDistMatrix<T>& A, mpi::Comm const& comm, int rank )
     case Device::CPU:
         Broadcast_impl(A, std::move(comm), rank);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Broadcast_impl(A, std::move(comm), rank);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Broadcast: Bad device.");
     }
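The guard swaps in these level-1 headers all protect the same runtime dispatch idiom, written out once here as a sketch. DispatchSketch is hypothetical; the casts mirror AllReduce and Axpy above:

    #include <El.hpp>
    using namespace El;

    template <typename T>
    void DispatchSketch(AbstractMatrix<T>& A)
    {
        // Dispatch from the type-erased base class to the concrete,
        // device-specific Matrix specialization.
        switch (A.GetDevice())
        {
        case Device::CPU:
            DispatchSketch(static_cast<Matrix<T, Device::CPU>&>(A));
            break;
    #ifdef HYDROGEN_HAVE_GPU
        case Device::GPU:
            DispatchSketch(static_cast<Matrix<T, Device::GPU>&>(A));
            break;
    #endif // HYDROGEN_HAVE_GPU
        default:
            LogicError("DispatchSketch: Bad device.");
        }
    }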
diff --git a/include/El/blas_like/level1/Contract.hpp b/include/El/blas_like/level1/Contract.hpp
index 263565f54c..f51cd7724a 100644
--- a/include/El/blas_like/level1/Contract.hpp
+++ b/include/El/blas_like/level1/Contract.hpp
@@ -87,11 +87,11 @@ void Contract
     case Device::CPU:
         ContractDispatch(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         ContractDispatch(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Contract: Bad device type.");
     }
diff --git a/include/El/blas_like/level1/Copy.hpp b/include/El/blas_like/level1/Copy.hpp
index 7d835c4c4f..77677b960a 100644
--- a/include/El/blas_like/level1/Copy.hpp
+++ b/include/El/blas_like/level1/Copy.hpp
@@ -13,11 +13,17 @@
 #include
 #endif

+#include
+
 #include
 #include
 #include
 #include

+#ifdef HYDROGEN_HAVE_GPU
+#include
+#endif
+
 #include

 // Introduce some metaprogramming notions.
diff --git a/include/El/blas_like/level1/Copy/ColAllGather.hpp b/include/El/blas_like/level1/Copy/ColAllGather.hpp
index a6529d9f35..2f7c8ff8a8 100644
--- a/include/El/blas_like/level1/Copy/ColAllGather.hpp
+++ b/include/El/blas_like/level1/Copy/ColAllGather.hpp
@@ -194,11 +194,11 @@ void ColAllGather
     case Device::CPU:
         ColAllGather_impl(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         ColAllGather_impl(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("ColAllGather: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/ColFilter.hpp b/include/El/blas_like/level1/Copy/ColFilter.hpp
index b40b54aef3..46619a9873 100644
--- a/include/El/blas_like/level1/Copy/ColFilter.hpp
+++ b/include/El/blas_like/level1/Copy/ColFilter.hpp
@@ -102,11 +102,11 @@ void ColFilter
     case Device::CPU:
         ColFilter_impl(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         ColFilter_impl(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("ColFilter: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/Exchange.hpp b/include/El/blas_like/level1/Copy/Exchange.hpp
index 28e9098bc0..13f168c710 100644
--- a/include/El/blas_like/level1/Copy/Exchange.hpp
+++ b/include/El/blas_like/level1/Copy/Exchange.hpp
@@ -139,11 +139,11 @@ void Exchange
     case Device::CPU:
         Exchange_impl(A,B,sendRank,recvRank,comm);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Exchange_impl(A,B,sendRank,recvRank,comm);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Exchange: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/PartialColFilter.hpp b/include/El/blas_like/level1/Copy/PartialColFilter.hpp
index 75dd08eff0..47c37b1f2f 100644
--- a/include/El/blas_like/level1/Copy/PartialColFilter.hpp
+++ b/include/El/blas_like/level1/Copy/PartialColFilter.hpp
@@ -112,11 +112,11 @@ void PartialColFilter
     case Device::CPU:
         PartialColFilter_impl(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         PartialColFilter_impl(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("PartialColFilter: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp b/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp
index 9857c03dbb..45b69e5b5e 100644
--- a/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp
+++ b/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp
@@ -135,11 +135,11 @@ void PartialRowAllGather
     case Device::CPU:
         PartialRowAllGather_impl(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         PartialRowAllGather_impl(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("PartialRowAllGather: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/PartialRowFilter.hpp b/include/El/blas_like/level1/Copy/PartialRowFilter.hpp
index 56aa379b9f..c9b4805ed5 100644
--- a/include/El/blas_like/level1/Copy/PartialRowFilter.hpp
+++ b/include/El/blas_like/level1/Copy/PartialRowFilter.hpp
@@ -113,11 +113,11 @@ void PartialRowFilter
     case Device::CPU:
         PartialRowFilter_impl(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         PartialRowFilter_impl(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("PartialRowFilter: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/RowAllGather.hpp b/include/El/blas_like/level1/Copy/RowAllGather.hpp
index 269950ce9a..7193be2269 100644
--- a/include/El/blas_like/level1/Copy/RowAllGather.hpp
+++ b/include/El/blas_like/level1/Copy/RowAllGather.hpp
@@ -164,11 +164,11 @@ void RowAllGather
     case Device::CPU:
         RowAllGather_impl(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         RowAllGather_impl(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("RowAllGather: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/RowFilter.hpp b/include/El/blas_like/level1/Copy/RowFilter.hpp
index dbc59635ee..5fc097eb24 100644
--- a/include/El/blas_like/level1/Copy/RowFilter.hpp
+++ b/include/El/blas_like/level1/Copy/RowFilter.hpp
@@ -98,11 +98,11 @@ void RowFilter
     case Device::CPU:
         RowFilter_impl(A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         RowFilter_impl(A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("RowFilter: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Copy/TransposeDist.hpp b/include/El/blas_like/level1/Copy/TransposeDist.hpp
index bbfe13dce1..4f85db86ef 100644
--- a/include/El/blas_like/level1/Copy/TransposeDist.hpp
+++ b/include/El/blas_like/level1/Copy/TransposeDist.hpp
@@ -215,7 +215,7 @@ void TransposeDist(DistMatrix const& A,
     }
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU

 // FIXME (trb): This should work just fine, but it might not have
 // optimal performance for row/column vectors (A.Height() or A.Width()
@@ -275,7 +275,7 @@ void TransposeDist(DistMatrix const& A,
 }

-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template
 void TransposeDist(DistMatrix const& A,
diff --git a/include/El/blas_like/level1/Copy/internal_decl.hpp b/include/El/blas_like/level1/Copy/internal_decl.hpp
index ce077bf1bc..c7aa1016a6 100644
--- a/include/El/blas_like/level1/Copy/internal_decl.hpp
+++ b/include/El/blas_like/level1/Copy/internal_decl.hpp
@@ -64,12 +64,12 @@ template const& A,
     DistMatrix& B );

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template >>
 void TransposeDist(
     DistMatrix const& A, DistMatrix& B );
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template >,typename=void>
diff --git a/include/El/blas_like/level1/Copy/util.hpp b/include/El/blas_like/level1/Copy/util.hpp
index bff09d31d3..c063b29e40 100644
--- a/include/El/blas_like/level1/Copy/util.hpp
+++ b/include/El/blas_like/level1/Copy/util.hpp
@@ -9,7 +9,7 @@
 #ifndef EL_BLAS_COPY_UTIL_HPP
 #define EL_BLAS_COPY_UTIL_HPP

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 #include
 #endif
@@ -229,7 +229,7 @@ void PartialRowStridedUnpack(
     }
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template >>
 void DeviceStridedMemCopy(
@@ -250,19 +250,17 @@ void InterleaveMatrix(
 {
     if (colStrideA == 1 && colStrideB == 1)
     {
-        H_CHECK_CUDA(
-            cudaMemcpy2DAsync(B, rowStrideB*sizeof(T),
-                              A, rowStrideA*sizeof(T),
-                              height*sizeof(T), width,
-                              cudaMemcpyDeviceToDevice,
-                              syncInfo.stream_));
+        gpu::Copy2DIntraDevice(A, rowStrideA,
+                               B, rowStrideB,
+                               height, width,
+                               syncInfo);
     }
     else
     {
         hydrogen::Copy_GPU_impl(height, width,
                                 A, colStrideA, rowStrideA,
                                 B, colStrideB, rowStrideB,
-                                syncInfo.stream_);
+                                syncInfo);
     }
 }
@@ -278,12 +276,12 @@ void RowStridedPack(
     {
         const Int rowShift = Shift_(k, rowAlign, rowStride);
         const Int localWidth = Length_(width, rowShift, rowStride);
-        H_CHECK_CUDA(
-            cudaMemcpy2DAsync(BPortions + k*portionSize, height*sizeof(T),
-                              A+rowShift*ALDim, rowStride*ALDim*sizeof(T),
-                              height*sizeof(T), localWidth,
-                              cudaMemcpyDeviceToDevice,
-                              syncInfo.stream_));
+
+        gpu::Copy2DIntraDevice(
+            A+rowShift*ALDim, rowStride*ALDim,
+            BPortions + k*portionSize, height,
+            height, localWidth,
+            syncInfo);
     }
 }
@@ -299,12 +297,11 @@ void RowStridedUnpack(
     {
         const Int rowShift = Shift_(k, rowAlign, rowStride);
         const Int localWidth = Length_(width, rowShift, rowStride);
-        H_CHECK_CUDA(
-            cudaMemcpy2DAsync(B+rowShift*BLDim, rowStride*BLDim*sizeof(T),
-                              APortions+k*portionSize, height*sizeof(T),
-                              height*sizeof(T), localWidth,
-                              cudaMemcpyDeviceToDevice,
-                              syncInfo.stream_));
+        gpu::Copy2DIntraDevice(
+            APortions+k*portionSize, height,
+            B+rowShift*BLDim, rowStride*BLDim,
+            height, localWidth,
+            syncInfo);
     }
 }
@@ -324,12 +321,12 @@ void PartialRowStridedPack(
                                      rowAlign, rowStride);
         const Int rowOffset = (rowShift-rowShiftA) / rowStridePart;
         const Int localWidth = Length_(width, rowShift, rowStride);
-        H_CHECK_CUDA(cudaMemcpy2DAsync(
-            BPortions + k*portionSize, height*sizeof(T),
-            A + rowOffset*ALDim, rowStrideUnion*ALDim*sizeof(T),
-            height*sizeof(T), localWidth,
-            cudaMemcpyDeviceToDevice,
-            syncInfo.stream_));
+
+        gpu::Copy2DIntraDevice(
+            A + rowOffset*ALDim, rowStrideUnion*ALDim,
+            BPortions + k*portionSize, height,
+            height, localWidth,
+            syncInfo);
     }
 }
@@ -349,16 +346,15 @@ void PartialRowStridedUnpack(
                                      rowAlign, rowStride);
         const Int rowOffset = (rowShift-rowShiftB) / rowStridePart;
         const Int localWidth = Length_(width, rowShift, rowStride);
-        H_CHECK_CUDA(cudaMemcpy2DAsync(
-            B + rowOffset*BLDim, rowStrideUnion*BLDim*sizeof(T),
-            APortions + k*portionSize, height*sizeof(T),
-            height*sizeof(T), localWidth,
-            cudaMemcpyDeviceToDevice,
-            syncInfo.stream_));
+        gpu::Copy2DIntraDevice(
+            APortions + k*portionSize, height,
+            B + rowOffset*BLDim, rowStrideUnion*BLDim,
+            height, localWidth,
+            syncInfo);
     }
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T>
 void ColStridedPack(
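The calls above replace raw cudaMemcpy2DAsync with gpu::Copy2DIntraDevice. One plausible shape for that wrapper, assuming it simply forwards to the vendor 2-D memcpy on the SyncInfo's stream; the real implementation lives in Hydrogen's gpu headers and may differ:

    template <typename T>
    void Copy2DIntraDevice(T const* src, size_t ldSrc,
                           T* dest, size_t ldDest,
                           size_t height, size_t width,
                           SyncInfo<Device::GPU> const& si)
    {
        // Column-major layout: each "row" of the 2-D copy is one column of
        // height elements; there are width such columns.
    #if defined(HYDROGEN_HAVE_CUDA)
        H_CHECK_CUDA(
            cudaMemcpy2DAsync(dest, ldDest*sizeof(T), src, ldSrc*sizeof(T),
                              height*sizeof(T), width,
                              cudaMemcpyDeviceToDevice, si.Stream()));
    #elif defined(HYDROGEN_HAVE_ROCM)
        H_CHECK_HIP(
            hipMemcpy2DAsync(dest, ldDest*sizeof(T), src, ldSrc*sizeof(T),
                             height*sizeof(T), width,
                             hipMemcpyDeviceToDevice, si.Stream()));
    #endif
    }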
diff --git a/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp b/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp
index bcc23dec7b..22adba7eca 100644
--- a/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp
+++ b/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp
@@ -47,13 +47,13 @@ void CopyAsync(ElementalMatrix<T> const& A, DistMatrix& B)
             static_cast<DistMatrix<T, U, V, ELEMENT, Device::CPU> const&>(A),
             B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         CopyAsync(
             static_cast<DistMatrix<T, U, V, ELEMENT, Device::GPU> const&>(A),
             B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("CopyAsync: Unknown device type.");
     }
diff --git a/include/El/blas_like/level1/CopyAsyncLocal.hpp b/include/El/blas_like/level1/CopyAsyncLocal.hpp
index 10e3add816..b19246c01a 100644
--- a/include/El/blas_like/level1/CopyAsyncLocal.hpp
+++ b/include/El/blas_like/level1/CopyAsyncLocal.hpp
@@ -53,8 +53,8 @@ void CopyAsyncImpl(
     const T* EL_RESTRICT ABuf = A.LockedBuffer();
     T* EL_RESTRICT BBuf = B.Buffer();

-    InterDeviceCopy<Device::CPU, Device::GPU>::MemCopy2DAsync(
-        BBuf, ldB, ABuf, ldA, height, width, B.Stream());
+    details::InterdeviceCopy<Device::CPU, Device::GPU>::Copy2DAsync(
+        ABuf, ldA, BBuf, ldB, height, width, SyncInfoFromMatrix(B));
 }

 template const& A,
     const T* EL_RESTRICT ABuf = A.LockedBuffer();
     T* EL_RESTRICT BBuf = B.Buffer();

-    InterDeviceCopy<Device::GPU, Device::CPU>::MemCopy2DAsync(
-        BBuf, ldB, ABuf, ldA, height, width, A.Stream());
+    details::InterdeviceCopy<Device::GPU, Device::CPU>::Copy2DAsync(
+        ABuf, ldA, BBuf, ldB, height, width, SyncInfoFromMatrix(A));
 }
 #endif // HYDROGEN_HAVE_GPU
diff --git a/include/El/blas_like/level1/CopyLocal.hpp b/include/El/blas_like/level1/CopyLocal.hpp
index e10d38e598..2c3dbe8e8d 100644
--- a/include/El/blas_like/level1/CopyLocal.hpp
+++ b/include/El/blas_like/level1/CopyLocal.hpp
@@ -129,8 +129,7 @@ void CopyImpl(Matrix const& A, Matrix& B)
         syncInfoB);
 }

-#ifdef HYDROGEN_HAVE_CUDA
-// If using CUDA, prefer the cudaMemcpy2D implementation. This is
+// If using GPU, prefer the (cuda|hip)Memcpy2D implementation. This is
 // ASYNCHRONOUS with respect to the host.
 // (Case 1, GPU)
 //
@@ -153,14 +152,11 @@ void CopyImpl(Matrix const& A, Matrix& B)
     auto syncHelper = MakeMultiSync(syncInfoB, syncInfoA);

     // Launch the copy
-    H_CHECK_CUDA(
-        cudaMemcpy2DAsync(BBuf, ldB*sizeof(T),
-                          ABuf, ldA*sizeof(T),
-                          height*sizeof(T), width,
-                          cudaMemcpyDeviceToDevice,
-                          syncInfoB.stream_));
+    gpu::Copy2DIntraDevice(ABuf, ldA,
+                           BBuf, ldB,
+                           height, width,
+                           syncInfoB);
 }
-#endif // HYDROGEN_HAVE_CUDA

 namespace details
 {
@@ -178,6 +174,29 @@ struct InterdeviceSync

     SyncInfo<Device::GPU> gpu_sync_;
 };
+
+template <Device SrcD, Device DestD>
+struct InterdeviceCopy;
+
+template <>
+struct InterdeviceCopy<Device::CPU, Device::GPU>
+{
+    template <typename... Args>
+    static void Copy2DAsync(Args&&... args)
+    {
+        gpu::Copy2DToDevice(std::forward<Args>(args)...);
+    }
+};
+
+template <>
+struct InterdeviceCopy<Device::GPU, Device::CPU>
+{
+    template <typename... Args>
+    static void Copy2DAsync(Args&&... args)
+    {
+        gpu::Copy2DToHost(std::forward<Args>(args)...);
+    }
+};
 }

 // These inter-device copy functions are SYNCHRONOUS with respect to
@@ -200,8 +219,8 @@ void CopyImpl(Matrix const& A, Matrix& B)
     details::InterdeviceSync isync(SyncInfoFromMatrix(A),
                                    SyncInfoFromMatrix(B));

-    InterDeviceCopy<Device::CPU, Device::GPU>::MemCopy2DAsync(
-        BBuf, ldB, ABuf, ldA, height, width, isync.gpu_sync_.stream_);
+    details::InterdeviceCopy<Device::CPU, Device::GPU>::Copy2DAsync(
+        ABuf, ldA, BBuf, ldB, height, width, isync.gpu_sync_);
     Synchronize(isync.gpu_sync_); // Is this necessary??
 }
@@ -225,8 +244,8 @@ void CopyImpl(Matrix const& A,
     details::InterdeviceSync isync(SyncInfoFromMatrix(A),
                                    SyncInfoFromMatrix(B));

-    InterDeviceCopy<Device::GPU, Device::CPU>::MemCopy2DAsync(
-        BBuf, ldB, ABuf, ldA, height, width, isync.gpu_sync_.stream_);
+    details::InterdeviceCopy<Device::GPU, Device::CPU>::Copy2DAsync(
+        ABuf, ldA, BBuf, ldB, height, width, isync.gpu_sync_);
     Synchronize(isync.gpu_sync_); // Is this necessary??
 }
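A usage sketch for the InterdeviceCopy trait introduced above, assuming the specializations are keyed on (source, destination) device as reconstructed; buffers and leading dimensions are illustrative:

    template <typename T>
    void HostToDeviceSketch(T const* hostBuf, int ldHost,
                            T* devBuf, int ldDev,
                            int height, int width)
    {
        auto si = gpu::DefaultSyncInfo();
        // Host -> GPU resolves to gpu::Copy2DToDevice; the reverse
        // specialization resolves to gpu::Copy2DToHost.
        El::details::InterdeviceCopy<El::Device::CPU, El::Device::GPU>::
            Copy2DAsync(hostBuf, ldHost, devBuf, ldDev, height, width, si);
        Synchronize(si); // asynchronous w.r.t. the host until synchronized
    }

The trait lets one call site handle both transfer directions, which is what allows CopyAsyncImpl and CopyImpl above to share a single argument order (source first, destination second).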
diff --git a/include/El/blas_like/level1/DiagonalScale.hpp b/include/El/blas_like/level1/DiagonalScale.hpp
index cc709d9fde..9e2ab95960 100644
--- a/include/El/blas_like/level1/DiagonalScale.hpp
+++ b/include/El/blas_like/level1/DiagonalScale.hpp
@@ -13,7 +13,7 @@
 namespace El
 {

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 void DiagonalScale(
     LeftOrRight side, Orientation orientation,
@@ -47,7 +47,7 @@
     LogicError("DiagonalScale: Bad device type.");
 }

-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T>
 void DiagonalScale(
@@ -104,13 +104,13 @@ void DiagonalScale(LeftOrRight side,
             static_cast<Matrix<T, Device::CPU> const&>(d),
             static_cast<Matrix<T, Device::CPU>&>(A));
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         DiagonalScale(side, orientation,
                       static_cast<Matrix<T, Device::GPU> const&>(d),
                       static_cast<Matrix<T, Device::GPU>&>(A));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("DiagonalScale: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Dot.hpp b/include/El/blas_like/level1/Dot.hpp
index a832e52a0c..a61cf1a8ee 100644
--- a/include/El/blas_like/level1/Dot.hpp
+++ b/include/El/blas_like/level1/Dot.hpp
@@ -30,12 +30,12 @@ T Dot( const AbstractMatrix<T>& A, const AbstractMatrix<T>& B )
         sum = Dot(static_cast<Matrix<T, Device::CPU> const&>(A),
                   static_cast<Matrix<T, Device::CPU> const&>(B));
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         sum = Dot(static_cast<Matrix<T, Device::GPU> const&>(A),
                   static_cast<Matrix<T, Device::GPU> const&>(B));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Unsupported device type.");
     }
diff --git a/include/El/blas_like/level1/EntrywiseFill.hpp b/include/El/blas_like/level1/EntrywiseFill.hpp
index 2c3944b211..cf0769ac1b 100644
--- a/include/El/blas_like/level1/EntrywiseFill.hpp
+++ b/include/El/blas_like/level1/EntrywiseFill.hpp
@@ -23,7 +23,7 @@ void EntrywiseFill( Matrix<T>& A, function<T(void)> func )
 }

 // FIXME: Make proper kernel
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 void EntrywiseFill(Matrix<T, Device::GPU>& A, function<T(void)> func)
 {
@@ -32,7 +32,7 @@ void EntrywiseFill(Matrix<T, Device::GPU>& A, function<T(void)> func)
     EntrywiseFill(CPU_Mat, std::move(func));
     A = CPU_Mat;
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T>
 void EntrywiseFill( AbstractDistMatrix<T>& A, function<T(void)> func )
@@ -50,12 +50,12 @@ void EntrywiseFill( AbstractDistMatrix<T>& A, function<T(void)> func )
 EL_EXTERN template void EntrywiseFill \
 ( AbstractDistMatrix& A, function func );

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 EL_EXTERN template void EntrywiseFill(
     Matrix&, function);
 EL_EXTERN template void EntrywiseFill(
     Matrix&, function);
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 #define EL_ENABLE_DOUBLEDOUBLE
 #define EL_ENABLE_QUADDOUBLE
diff --git a/include/El/blas_like/level1/Fill.hpp b/include/El/blas_like/level1/Fill.hpp
index 5ddbb7894a..0e0e3d0847 100644
--- a/include/El/blas_like/level1/Fill.hpp
+++ b/include/El/blas_like/level1/Fill.hpp
@@ -9,7 +9,7 @@
 #ifndef EL_BLAS_FILL_HPP
 #define EL_BLAS_FILL_HPP

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 #include
 #endif
@@ -49,14 +49,14 @@ void Fill( AbstractMatrix<T>& A, T alpha )
         }
     }
     break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         hydrogen::Fill_GPU_impl(
             m, n, alpha, ABuf, ALDim,
             SyncInfoFromMatrix(
-                static_cast<Matrix<T, Device::GPU>&>(A)).stream_);
+                static_cast<Matrix<T, Device::GPU>&>(A)));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Bad device type in Fill");
     }
diff --git a/include/El/blas_like/level1/Hadamard.hpp b/include/El/blas_like/level1/Hadamard.hpp
index f670e81e38..7e641b555c 100644
--- a/include/El/blas_like/level1/Hadamard.hpp
+++ b/include/El/blas_like/level1/Hadamard.hpp
@@ -9,9 +9,9 @@
 #ifndef EL_BLAS_HADAMARD_HPP
 #define EL_BLAS_HADAMARD_HPP

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 #include
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 // C(i,j) := A(i,j) B(i,j)
@@ -78,7 +78,7 @@ void Hadamard(AbstractMatrix<T> const& A, AbstractMatrix<T> const& B,
         }
     }
     break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
     {
         auto si_A = SyncInfoFromMatrix(
             static_cast<Matrix<T, Device::GPU> const&>(A));
@@ -93,10 +93,10 @@ void Hadamard(AbstractMatrix<T> const& A, AbstractMatrix<T> const& B,
         hydrogen::Hadamard_GPU_impl(height, width,
                                     ABuf, 1, ALDim,
                                     BBuf, 1, BLDim,
                                     CBuf, 1, CLDim,
-                                    si_C.stream_);
+                                    si_C);
     }
     break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Bad device type for Hadamard.");
     }
diff --git a/include/El/blas_like/level1/IndexDependentMap.hpp b/include/El/blas_like/level1/IndexDependentMap.hpp
index 168bdb565b..37195a648c 100644
--- a/include/El/blas_like/level1/IndexDependentMap.hpp
+++ b/include/El/blas_like/level1/IndexDependentMap.hpp
@@ -52,12 +52,12 @@ void IndexDependentMap( AbstractMatrix<T>& A, function func
     case Device::CPU:
         IndexDependentMap(static_cast<Matrix<T, Device::CPU>&>(A), func);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         LogicError("IndexDependentMap: Unsupported device type.");
         // IndexDependentMap(static_cast<Matrix<T, Device::GPU>&>(A), func);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("IndexDependentMap: Unsupported device type.");
     }
diff --git a/include/El/blas_like/level1/Recv.hpp b/include/El/blas_like/level1/Recv.hpp
index 32d1805ee3..f62a9a3249 100644
--- a/include/El/blas_like/level1/Recv.hpp
+++ b/include/El/blas_like/level1/Recv.hpp
@@ -49,11 +49,11 @@ void Recv(AbstractMatrix<T>& A, mpi::Comm const& comm, int source)
     case Device::CPU:
         Recv(static_cast<Matrix<T, Device::CPU>&>(A), std::move(comm), source);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Recv(static_cast<Matrix<T, Device::GPU>&>(A), std::move(comm), source);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Recv: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Round.hpp b/include/El/blas_like/level1/Round.hpp
index 80967a6167..99c332d403 100644
--- a/include/El/blas_like/level1/Round.hpp
+++ b/include/El/blas_like/level1/Round.hpp
@@ -20,11 +20,11 @@ void Round(AbstractMatrix<T>& A)
     case Device::CPU:
         Round(static_cast<Matrix<T, Device::CPU>&>(A));
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Round(static_cast<Matrix<T, Device::GPU>&>(A));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Invalid device type.");
     }
diff --git a/include/El/blas_like/level1/Scale.hpp b/include/El/blas_like/level1/Scale.hpp
index 5162d00516..c0c4d0888b 100644
--- a/include/El/blas_like/level1/Scale.hpp
+++ b/include/El/blas_like/level1/Scale.hpp
@@ -13,7 +13,7 @@
 namespace El
 {

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template >>
 void Scale(T const& alpha, Matrix<T, Device::GPU>& A)
 {
@@ -36,7 +36,7 @@
 void Scale(T const&, Matrix<T, Device::GPU>&)
 {
     LogicError("Scale: Bad device/type combo!");
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template >>
@@ -102,11 +102,11 @@ void Scale( S alphaS, AbstractMatrix<T>& A )
     case Device::CPU:
         Scale(alpha, static_cast<Matrix<T, Device::CPU>&>(A));
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Scale(alpha, static_cast<Matrix<T, Device::GPU>&>(A));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Bad device type in Scale");
     }
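Scale (and TransposeAxpy below) pair a GPU-enabled overload with a fallback that rejects unsupported type/device combinations at compile time. A sketch of that gating pattern; IsGPUSupported is a hypothetical stand-in for Hydrogen's real type predicates, which appear in the (garbled) template heads above:

    #include <type_traits>

    template <typename T,
              typename = std::enable_if_t<IsGPUSupported<T>::value>>
    void ScaleSketch(T const& alpha, Matrix<T, Device::GPU>& A)
    {
        // ... dispatch to the GPU BLAS scal routine ...
    }

    template <typename T,
              typename = std::enable_if_t<!IsGPUSupported<T>::value>,
              typename = void> // extra parameter disambiguates the overloads
    void ScaleSketch(T const&, Matrix<T, Device::GPU>&)
    {
        LogicError("Scale: Bad device/type combo!");
    }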
diff --git a/include/El/blas_like/level1/Send.hpp b/include/El/blas_like/level1/Send.hpp
index 58ba29fb00..447dbe7fc0 100644
--- a/include/El/blas_like/level1/Send.hpp
+++ b/include/El/blas_like/level1/Send.hpp
@@ -49,12 +49,12 @@ void Send(AbstractMatrix<T> const& A, mpi::Comm const& comm, int destination)
         Send(static_cast<Matrix<T, Device::CPU> const&>(A),
              std::move(comm), destination);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Send(static_cast<Matrix<T, Device::GPU> const&>(A),
              std::move(comm), destination);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Send: Bad Device.");
     }
diff --git a/include/El/blas_like/level1/SendRecv.hpp b/include/El/blas_like/level1/SendRecv.hpp
index 305c023c88..40b5f48c43 100644
--- a/include/El/blas_like/level1/SendRecv.hpp
+++ b/include/El/blas_like/level1/SendRecv.hpp
@@ -27,14 +27,14 @@ void SendRecv(
             static_cast<Matrix<T, Device::CPU>&>(B),
             comm, sendRank, recvRank);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         SendRecv(
             static_cast<Matrix<T, Device::GPU> const&>(A),
             static_cast<Matrix<T, Device::GPU>&>(B),
             comm, sendRank, recvRank);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("SendRecv: Unsupported device.");
     }
@@ -108,14 +108,14 @@ void SendRecv
 ( const Matrix& A, Matrix& B, mpi::Comm const& comm, \
   int sendRank, int recvRank );

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 EL_EXTERN template void SendRecv(
     Matrix const&, Matrix&,
     mpi::Comm const&, int, int);
 EL_EXTERN template void SendRecv(
     Matrix const&, Matrix&,
     mpi::Comm const&, int, int);
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 #define EL_ENABLE_DOUBLEDOUBLE
 #define EL_ENABLE_QUADDOUBLE
diff --git a/include/El/blas_like/level1/Transpose.hpp b/include/El/blas_like/level1/Transpose.hpp
index 1f2113d78d..b6b230f70e 100644
--- a/include/El/blas_like/level1/Transpose.hpp
+++ b/include/El/blas_like/level1/Transpose.hpp
@@ -84,13 +84,13 @@ void Transpose(AbstractMatrix<T> const& A, AbstractMatrix<T>& B,
             static_cast<Matrix<T, Device::CPU> const&>(A),
             static_cast<Matrix<T, Device::CPU>&>(B), conjugate);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Transpose(
             static_cast<Matrix<T, Device::GPU> const&>(A),
             static_cast<Matrix<T, Device::GPU>&>(B), conjugate);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Bad device for transform.");
     }
@@ -158,7 +158,7 @@ void Transpose( const Matrix<T>& A, Matrix<T>& B, bool conjugate )
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 void Transpose(Matrix<T, Device::GPU> const& A,
                Matrix<T, Device::GPU>& B, bool conjugate )
@@ -186,7 +186,7 @@ void Transpose(Matrix<T, Device::GPU> const& A,
 {
     LogicError("Bad device type!");
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T>
 void Transpose
@@ -416,7 +416,7 @@ void Adjoint
 EL_EXTERN template void Transpose( \
     Matrix const& A, Matrix& B, bool conjugate);

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 EL_EXTERN template void Transpose(
     Matrix const& A, Matrix& B, bool conjugate);
@@ -431,7 +431,7 @@ EL_EXTERN template void Transpose(
     Matrix& B, bool conjugate);
 #endif // HYDROGEN_GPU_USE_FP16

-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 #define EL_ENABLE_DOUBLEDOUBLE
 #define EL_ENABLE_QUADDOUBLE
diff --git a/include/El/blas_like/level1/TransposeAxpy.hpp b/include/El/blas_like/level1/TransposeAxpy.hpp
index 71d074153f..21fb366479 100644
--- a/include/El/blas_like/level1/TransposeAxpy.hpp
+++ b/include/El/blas_like/level1/TransposeAxpy.hpp
@@ -27,7 +27,7 @@ void TransposeAxpy(
             static_cast<Matrix<T, Device::CPU>&>(Y),
             conjugate);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         TransposeAxpy(alphaS,
            static_cast<Matrix<T, Device::GPU> const&>(X),
@@ -35,7 +35,7 @@ void TransposeAxpy(
             conjugate);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Bad device for TransposeAxpy");
     }
@@ -106,7 +106,7 @@ void TransposeAxpy
     }
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template >>
 void TransposeAxpy(S alphaS,
@@ -175,7 +175,7 @@ void TransposeAxpy
 (S alphaS,
 {
     LogicError("TransposeAxpy: Bad type/device combo.");
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T, typename S>
 void TransposeAxpy
diff --git a/include/El/blas_like/level1/Zero.hpp b/include/El/blas_like/level1/Zero.hpp
index dbeae4fe6e..4cef287ff9 100644
--- a/include/El/blas_like/level1/Zero.hpp
+++ b/include/El/blas_like/level1/Zero.hpp
@@ -13,7 +13,7 @@
 #include
 #endif

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 #include
 #endif
@@ -101,14 +101,14 @@ void Zero( AbstractMatrix<T>& A )
         }
     }
     break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         hydrogen::Fill_GPU_impl(
             height, width, TypeTraits<T>::Zero(), ABuf, ALDim,
             SyncInfoFromMatrix(
-                static_cast<Matrix<T, Device::GPU>&>(A)).stream_);
+                static_cast<Matrix<T, Device::GPU>&>(A)));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Bad device type in Zero");
     }
diff --git a/include/El/blas_like/level1/decl.hpp b/include/El/blas_like/level1/decl.hpp
index e41a5de239..dcb9faf4ba 100644
--- a/include/El/blas_like/level1/decl.hpp
+++ b/include/El/blas_like/level1/decl.hpp
@@ -103,14 +103,14 @@ void InterleaveMatrixUpdate(
     Ring* B, Int colStrideB, Int rowStrideB,
     SyncInfo<Device::CPU>);

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename Ring>
 void InterleaveMatrixUpdate(
     Ring alpha, Int localHeight, Int localWidth,
     Ring const* A, Int colStrideA, Int rowStrideA,
     Ring* B, Int colStrideB, Int rowStrideB,
     SyncInfo<Device::GPU>);
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template
 void UpdateWithLocalData(
@@ -414,7 +414,7 @@ void PartialRowStridedUnpack(
     T* B, Int BLDim,
     SyncInfo<Device::CPU> );

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template >>
 void InterleaveMatrix(
     Int height, Int width,
@@ -458,7 +458,7 @@ void PartialRowStridedUnpack(
     T* B, Int BLDim,
     SyncInfo<Device::GPU> );
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template >, typename=void>
@@ -622,7 +622,7 @@ namespace El

 // DiagonalScale
 // =============

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template>>
 void DiagonalScale
 ( LeftOrRight side, Orientation orientation,
@@ -632,7 +632,7 @@ template>,
 void DiagonalScale
 ( LeftOrRight side, Orientation orientation,
   Matrix<T, Device::GPU> const& d, Matrix<T, Device::GPU>& A );
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T>
 void DiagonalScale(
@@ -741,10 +741,10 @@ template void EntrywiseFill( Matrix& A, function func );
 template <typename T>
 void EntrywiseFill( AbstractDistMatrix<T>& A, function<T(void)> func );

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 void EntrywiseFill( Matrix<T, Device::GPU>& A, function<T(void)> func );
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 // EntrywiseMap
 // ============
@@ -1597,7 +1597,7 @@ void Transpose
 ( const Matrix<T>& A,
         Matrix<T>& B,
   bool conjugate=false );

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template >>
 void Transpose
 ( Matrix<T, Device::GPU> const& A,
@@ -1610,7 +1610,7 @@ void Transpose
 ( Matrix const& A,
   Matrix& B,
   bool conjugate=false );
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 template <typename T>
 void Transpose
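The declarations above show the recurring signature migration in this patch: GPU helpers now accept SyncInfo<Device::GPU> rather than a raw cudaStream_t, and only the implementation unwraps the native stream. In miniature (FillSketch is illustrative, not part of the patch; Fill_GPU_impl taking a SyncInfo is exactly the call made in Fill.hpp and Zero.hpp above):

    template <typename T>
    void FillSketch(int height, int width, T alpha, T* buf, int ldim,
                    SyncInfo<Device::GPU> const& si)
    {
        // Vendor-neutral callers never touch si.Stream(); the stream type
        // (cudaStream_t or hipStream_t) is an implementation detail.
        hydrogen::Fill_GPU_impl(height, width, alpha, buf, ldim, si);
    }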
diff --git a/include/El/core.hpp b/include/El/core.hpp
index 746cc0756e..173e0e6247 100644
--- a/include/El/core.hpp
+++ b/include/El/core.hpp
@@ -57,13 +57,16 @@
 #include
 #endif // HYDROGEN_HAVE_GPU

-#ifdef HYDROGEN_HAVE_CUDA
+#if defined(HYDROGEN_HAVE_CUDA)
 #include
+#include
+#elif defined(HYDROGEN_HAVE_ROCM)
+#include
+#endif
+
 #ifdef HYDROGEN_HAVE_CUB
-#include
+#include
 #endif // HYDROGEN_HAVE_CUB
-#include
-#endif // HYDROGEN_HAVE_CUDA

 // Inject Hydrogen-specific symbols into El
 namespace El
diff --git a/include/El/core/AbstractMatrix/decl.hpp b/include/El/core/AbstractMatrix/decl.hpp
index 26d78c9207..b039f59c84 100644
--- a/include/El/core/AbstractMatrix/decl.hpp
+++ b/include/El/core/AbstractMatrix/decl.hpp
@@ -276,7 +276,7 @@ class AbstractMatrix
         return static_cast<Matrix<T, Device::CPU>&>(*this);
     }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     H_DEPRECATED("Extremely dangerous. Will be removed soon.")
     operator Matrix<T, Device::GPU>& ()
     {
@@ -297,7 +297,7 @@ class AbstractMatrix
         }
         return static_cast<Matrix<T, Device::GPU>&>(*this);
     }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

     // Single-entry manipulation
     // =========================
diff --git a/include/El/core/Element/impl.hpp b/include/El/core/Element/impl.hpp
index 4680bd2955..9110cdf5a8 100644
--- a/include/El/core/Element/impl.hpp
+++ b/include/El/core/Element/impl.hpp
@@ -135,7 +135,7 @@ inline void UpdateRealPart(
 {
     alpha = float(alpha)+float(beta);
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_GPU_USE_FP16

 template <typename Real>
 void UpdateRealPart( Complex<Real>& alpha, const Real& beta )
diff --git a/include/El/core/Matrix/decl.hpp b/include/El/core/Matrix/decl.hpp
index 4c486692b0..c412f1ee67 100644
--- a/include/El/core/Matrix/decl.hpp
+++ b/include/El/core/Matrix/decl.hpp
@@ -9,7 +9,14 @@
 #ifndef EL_MATRIX_DECL_HPP
 #define EL_MATRIX_DECL_HPP

+#include
+
 #include
+
+#ifdef HYDROGEN_HAVE_GPU
+#include
+#endif // HYDROGEN_HAVE_GPU
+
 #include
 #include
@@ -95,13 +102,13 @@ class Matrix : public AbstractMatrix
      */
     Matrix& operator=(Matrix&& A);

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     /** @brief Create a copy of a matrix from a GPU matrix */
     Matrix(Matrix<T, Device::GPU> const& A);

     /** @brief Assign by copying data from a GPU */
     Matrix& operator=(Matrix<T, Device::GPU> const& A);
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

     ///@}
     /** @name Abstract Copies. */
@@ -286,7 +293,7 @@ SyncInfo<Device::CPU> SyncInfoFromMatrix(Matrix<T, Device::CPU> const& mat)
 {
     return SyncInfo<Device::CPU>{};
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 // GPU version
 template <typename T>
 class Matrix<T, Device::GPU> : public AbstractMatrix<T>
@@ -467,11 +474,8 @@ class Matrix<T, Device::GPU> : public AbstractMatrix<T>
     /** @name Synchronization semantics */
     ///@{

-    cudaStream_t Stream() const EL_NO_EXCEPT;
-    cudaEvent_t Event() const EL_NO_EXCEPT;
-
-    void SetStream(cudaStream_t stream) EL_NO_EXCEPT;
-    void SetEvent(cudaEvent_t event) EL_NO_EXCEPT;
+    SyncInfo<Device::GPU> GetSyncInfo() const EL_NO_EXCEPT;
+    void SetSyncInfo(SyncInfo<Device::GPU> const&) EL_NO_EXCEPT;

     void UpdateMemSyncInfo() EL_NO_EXCEPT
     {
@@ -511,28 +515,23 @@ class Matrix<T, Device::GPU> : public AbstractMatrix<T>

     T* data_=nullptr;

-    cudaStream_t stream_ = GPUManager::Stream();
-    cudaEvent_t event_ = GPUManager::Event();
+    SyncInfo<Device::GPU> sync_info_ = gpu::DefaultSyncInfo();

 };// class Matrix

 template <typename T>
 SyncInfo<Device::GPU> SyncInfoFromMatrix(Matrix<T, Device::GPU> const& mat)
 {
-    return SyncInfo<Device::GPU>{mat.Stream(), mat.Event()};
+    return mat.GetSyncInfo();
 }

 template <typename T>
 void SetSyncInfo(
     Matrix<T, Device::GPU>& mat, SyncInfo<Device::GPU> const& syncInfo)
 {
-    if (syncInfo.stream_ != nullptr)
-        mat.SetStream(syncInfo.stream_);
-    if (syncInfo.event_ != nullptr)
-        mat.SetEvent(syncInfo.event_);
-    mat.UpdateMemSyncInfo();
+    mat.SetSyncInfo(syncInfo);
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 } // namespace El
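A usage sketch for the new synchronization accessors that replace the removed Stream()/Event() pair on the GPU Matrix specialization:

    template <typename T>
    void BindToSameStream(El::Matrix<T, El::Device::GPU>& A,
                          El::Matrix<T, El::Device::GPU>& B)
    {
        auto si = SyncInfoFromMatrix(A); // wraps A's stream and event
        SetSyncInfo(B, si);              // B's sync_info_ merges in A's
    }

Note the asymmetry: SyncInfoFromMatrix now just returns the stored SyncInfo, while SetSyncInfo delegates to the member function, which merges rather than overwrites (see sync_info_.Merge(si) in impl_gpu.hpp below).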
diff --git a/include/El/core/Matrix/impl.hpp b/include/El/core/Matrix/impl.hpp
index e8ed4dd6e1..df6089ca18 100644
--- a/include/El/core/Matrix/impl.hpp
+++ b/include/El/core/Matrix/impl.hpp
@@ -11,7 +11,7 @@

 #include "impl_cpu.hpp"

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 #include "impl_gpu.hpp"
 #endif
diff --git a/include/El/core/Matrix/impl_cpu.hpp b/include/El/core/Matrix/impl_cpu.hpp
index 157c951695..b6dab13f24 100644
--- a/include/El/core/Matrix/impl_cpu.hpp
+++ b/include/El/core/Matrix/impl_cpu.hpp
@@ -9,6 +9,12 @@
 #ifndef EL_MATRIX_IMPL_CPU_HPP_
 #define EL_MATRIX_IMPL_CPU_HPP_

+#include
+
+#ifdef HYDROGEN_HAVE_GPU
+#include
+#endif // HYDROGEN_HAVE_GPU
+
 #include

 namespace El
@@ -58,21 +64,24 @@ Matrix<T, Device::CPU>::Matrix(Matrix<T, Device::CPU> const& A)
     ::El::Copy(A, *this);
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 Matrix<T, Device::CPU>::Matrix(Matrix<T, Device::GPU> const& A)
     : Matrix{A.Height(), A.Width(), A.LDim()}
 {
     EL_DEBUG_CSE;
-    auto stream = GPUManager::Stream();
-    H_CHECK_CUDA(cudaMemcpy2DAsync(data_, this->LDim()*sizeof(T),
-                                   A.LockedBuffer(), A.LDim()*sizeof(T),
-                                   A.Height()*sizeof(T), A.Width(),
-                                   cudaMemcpyDeviceToHost,
-                                   stream));
-    H_CHECK_CUDA(cudaStreamSynchronize(stream));
+    auto syncinfo = SyncInfoFromMatrix(A);
+    gpu::Copy2DToHost(
+        A.LockedBuffer(), A.LDim(),
+        data_, this->LDim(),
+        A.Height(), A.Width(),
+        syncinfo);
+
+    // Cannot exit until this method has finished or matrix data might
+    // be invalid.
+    Synchronize(syncinfo);
 }
-#endif
+#endif // HYDROGEN_HAVE_GPU

 template <typename T>
 Matrix<T, Device::CPU>::Matrix(Matrix<T, Device::CPU>&& A) EL_NO_EXCEPT
diff --git a/include/El/core/Matrix/impl_gpu.hpp b/include/El/core/Matrix/impl_gpu.hpp
index cc11bcbdfb..aa0450c9eb 100644
--- a/include/El/core/Matrix/impl_gpu.hpp
+++ b/include/El/core/Matrix/impl_gpu.hpp
@@ -61,13 +61,15 @@ Matrix<T, Device::GPU>::Matrix(Matrix<T, Device::CPU> const& A)
     : Matrix{A.Height(), A.Width(), A.LDim()}
 {
     EL_DEBUG_CSE;
-    auto stream = this->Stream();
-    H_CHECK_CUDA(cudaMemcpy2DAsync(data_, this->LDim()*sizeof(T),
-                                   A.LockedBuffer(), A.LDim()*sizeof(T),
-                                   A.Height()*sizeof(T), A.Width(),
-                                   cudaMemcpyHostToDevice,
-                                   stream));
-    H_CHECK_CUDA(cudaStreamSynchronize(stream));
+    auto syncinfo = SyncInfoFromMatrix(*this);
+
+    gpu::Copy2DToDevice(
+        A.LockedBuffer(), A.LDim(),
+        data_, this->LDim(),
+        A.Height(), A.Width(),
+        syncinfo);
+
+    Synchronize(syncinfo);
 }

 template
@@ -274,12 +276,10 @@ T Matrix<T, Device::GPU>::Get(Int i, Int j) const
 #endif
     if (i == END) i = this->Height() - 1;
     if (j == END) j = this->Width() - 1;
-    auto stream = this->Stream();
+    auto syncinfo = SyncInfoFromMatrix(*this);
     T val;
-    H_CHECK_CUDA(cudaMemcpyAsync( &val, &data_[i+j*this->LDim()],
-                                  sizeof(T), cudaMemcpyDeviceToHost,
-                                  stream ));
-    H_CHECK_CUDA(cudaStreamSynchronize(stream));
+    gpu::Copy1DToHost(&data_[i+j*this->LDim()], &val, 1, syncinfo);
+    Synchronize(syncinfo);
     return val;
 }
@@ -319,10 +319,10 @@ void Matrix<T, Device::GPU>::Set(Int i, Int j, T const& alpha)
 #endif
     if (i == END) i = this->Height() - 1;
     if (j == END) j = this->Width() - 1;
-    H_CHECK_CUDA(cudaMemcpyAsync(&data_[i+j*this->LDim()], &alpha,
-                                 sizeof(T), cudaMemcpyHostToDevice,
-                                 stream_ ));
-    H_CHECK_CUDA(cudaStreamSynchronize(stream_));
+
+    auto syncinfo = SyncInfoFromMatrix(*this);
+    gpu::Copy1DToDevice(&alpha, &data_[i+j*this->LDim()], 1, syncinfo);
+    Synchronize(syncinfo);
 }

 template
@@ -500,27 +500,16 @@ T& Matrix<T, Device::GPU>::operator()(Int i, Int j)
 }

 template <typename T>
-cudaStream_t Matrix<T, Device::GPU>::Stream() const EL_NO_EXCEPT
-{
-    return stream_;
-}
-
-template <typename T>
-cudaEvent_t Matrix<T, Device::GPU>::Event() const EL_NO_EXCEPT
-{
-    return event_;
-}
-
-template <typename T>
-void Matrix<T, Device::GPU>::SetStream(cudaStream_t stream) EL_NO_EXCEPT
+SyncInfo<Device::GPU> Matrix<T, Device::GPU>::GetSyncInfo() const EL_NO_EXCEPT
 {
-    stream_ = stream;
+    return sync_info_;
 }

 template <typename T>
-void Matrix<T, Device::GPU>::SetEvent(cudaEvent_t event) EL_NO_EXCEPT
+void Matrix<T, Device::GPU>::SetSyncInfo(
+    SyncInfo<Device::GPU> const& si) EL_NO_EXCEPT
 {
-    event_ = event;
+    sync_info_.Merge(si);
 }

 #ifdef EL_INSTANTIATE_CORE
diff --git a/include/El/core/Memory/decl.hpp b/include/El/core/Memory/decl.hpp
index 9a1353c6a5..bd9b03737d 100644
--- a/include/El/core/Memory/decl.hpp
+++ b/include/El/core/Memory/decl.hpp
@@ -27,7 +27,7 @@ constexpr unsigned DefaultMemoryMode()
     return 0;
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <>
 constexpr unsigned DefaultMemoryMode<Device::GPU>()
 {
@@ -35,9 +35,9 @@ constexpr unsigned DefaultMemoryMode<Device::GPU>()
     return 1;
 #else
     return 0;
-#endif
-}
 #endif // HYDROGEN_HAVE_CUB
+}
+#endif // HYDROGEN_HAVE_GPU

 template <typename G, Device D>
 class Memory
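The Memory implementation that follows checks HIP calls with H_CHECK_HIP, which this diff uses but does not define. A plausible reconstruction, assuming it mirrors H_CHECK_CUDA; the real macro ships with Hydrogen's HIP support headers and may differ in its error text:

    // Hypothetical definition of H_CHECK_HIP, for illustration only.
    #define H_CHECK_HIP(cmd)                                          \
        do                                                            \
        {                                                             \
            hipError_t h_check_hip_err = (cmd);                       \
            if (h_check_hip_err != hipSuccess)                        \
                ::El::RuntimeError(                                   \
                    "HIP error: ",                                    \
                    hipGetErrorString(h_check_hip_err));              \
        } while (0)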
diff --git a/include/El/core/Memory/impl.hpp b/include/El/core/Memory/impl.hpp
index 104041a943..b74cf71c2e 100644
--- a/include/El/core/Memory/impl.hpp
+++ b/include/El/core/Memory/impl.hpp
@@ -14,13 +14,16 @@

 #include

-#ifdef HYDROGEN_HAVE_CUDA
+#if defined(HYDROGEN_HAVE_CUDA)
 #include
 #include
-#endif // HYDROGEN_HAVE_CUDA
+#elif defined(HYDROGEN_HAVE_ROCM)
+#include
+#include
+#endif // defined(HYDROGEN_HAVE_CUDA)

 #ifdef HYDROGEN_HAVE_CUB
-#include
+#include
 #endif

 #include "decl.hpp"
@@ -39,25 +42,29 @@ G* New(size_t size, unsigned int mode, SyncInfo<Device::CPU> const&)
     switch (mode) {
     case 0:
         ptr = static_cast<G*>(HostMemoryPool().Allocate(size * sizeof(G)));
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case 1:
         ptr = static_cast<G*>(PinnedHostMemoryPool().Allocate(size * sizeof(G)));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     case 2: ptr = new G[size]; break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case 3:
     {
         // Pinned memory
+#ifdef HYDROGEN_HAVE_CUDA
         auto error = cudaMallocHost(&ptr, size * sizeof(G));
         if (error != cudaSuccess)
         {
             RuntimeError("Failed to allocate pinned memory with message: ",
                          "\"", cudaGetErrorString(error), "\"");
         }
+#elif defined(HYDROGEN_HAVE_ROCM)
+        H_CHECK_HIP(hipHostMalloc(&ptr, size * sizeof(G)));
+#endif
     }
     break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default: RuntimeError("Invalid CPU memory allocation mode");
     }
     return ptr;
@@ -68,23 +75,27 @@ void Delete( G*& ptr, unsigned int mode, SyncInfo<Device::CPU> const& )
 {
     switch (mode) {
     case 0: HostMemoryPool().Free(ptr); break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case 1: PinnedHostMemoryPool().Free(ptr); break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     case 2: delete[] ptr; break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case 3:
     {
         // Pinned memory
+#if defined(HYDROGEN_HAVE_CUDA)
         auto error = cudaFreeHost(ptr);
         if (error != cudaSuccess)
         {
             RuntimeError("Failed to free pinned memory with message: ",
                          "\"", cudaGetErrorString(error), "\"");
         }
+#elif defined(HYDROGEN_HAVE_ROCM)
+        H_CHECK_HIP(hipHostFree(ptr));
+#endif
     }
     break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default: RuntimeError("Invalid CPU memory deallocation mode");
     }
     ptr = nullptr;
@@ -97,35 +108,51 @@ void MemZero( G* buffer, size_t numEntries, unsigned int mode,
     MemZero(buffer, numEntries);
 }

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename G>
 G* New( size_t size, unsigned int mode, SyncInfo<Device::GPU> const& syncInfo_ )
 {
     // Allocate memory
     G* ptr = nullptr;
+#if defined(HYDROGEN_HAVE_CUDA)
     cudaError_t status = cudaSuccess;
+    cudaError_t const success = cudaSuccess;
+#elif defined(HYDROGEN_HAVE_ROCM)
+    hipError_t status = hipSuccess;
+    hipError_t const success = hipSuccess;
+#endif
     switch (mode) {
+#if defined(HYDROGEN_HAVE_CUDA)
     case 0: status = cudaMalloc(&ptr, size * sizeof(G)); break;
+#elif defined(HYDROGEN_HAVE_ROCM)
+    case 0: status = hipMalloc(&ptr, size * sizeof(G)); break;
+#endif
 #ifdef HYDROGEN_HAVE_CUB
     case 1:
         status = hydrogen::cub::MemoryPool().DeviceAllocate(
             reinterpret_cast<void**>(&ptr),
             size * sizeof(G),
-            syncInfo_.stream_);
+            syncInfo_.Stream());
         break;
 #endif // HYDROGEN_HAVE_CUB
     default: RuntimeError("Invalid GPU memory allocation mode");
     }

     // Check for errors
-    if (status != cudaSuccess)
+    if (status != success)
     {
         size_t freeMemory = 0;
         size_t totalMemory = 0;
+#if defined(HYDROGEN_HAVE_CUDA)
         cudaMemGetInfo(&freeMemory, &totalMemory);
+        std::string error_string = cudaGetErrorString(status);
+#elif defined(HYDROGEN_HAVE_ROCM)
+        hipMemGetInfo(&freeMemory, &totalMemory);
+        std::string error_string = hipGetErrorString(status);
+#endif
         RuntimeError("Failed to allocate GPU memory with message: ",
-                     "\"", cudaGetErrorString(status), "\" ",
+                     "\"", error_string, "\" ",
                      "(",size*sizeof(G)," bytes requested, ",
                      freeMemory," bytes available, ",
                      totalMemory," bytes total)");
@@ -138,11 +165,20 @@
 template <typename G>
 void Delete( G*& ptr, unsigned int mode, SyncInfo<Device::GPU> const& )
 {
     switch (mode) {
+#if defined(HYDROGEN_HAVE_CUDA)
     case 0: H_CHECK_CUDA(cudaFree(ptr)); break;
+#elif defined(HYDROGEN_HAVE_ROCM)
+    case 0: H_CHECK_HIP(hipFree(ptr));
+        break;
+#endif
 #ifdef HYDROGEN_HAVE_CUB
     case 1:
+#if defined HYDROGEN_HAVE_CUDA
         H_CHECK_CUDA(
             hydrogen::cub::MemoryPool().DeviceFree(ptr));
+#elif defined HYDROGEN_HAVE_ROCM
+        H_CHECK_HIP(
+            hydrogen::cub::MemoryPool().DeviceFree(ptr));
+#endif
         break;
 #endif // HYDROGEN_HAVE_CUB
     default: RuntimeError("Invalid GPU memory deallocation mode");
@@ -154,12 +190,18 @@
 template <typename G>
 void MemZero( G* buffer, size_t numEntries, unsigned int mode,
               SyncInfo<Device::GPU> const& syncInfo_ )
 {
+#if defined(HYDROGEN_HAVE_CUDA)
     H_CHECK_CUDA(
         cudaMemsetAsync(buffer, 0x0, numEntries * sizeof(G),
-                        syncInfo_.stream_));
+                        syncInfo_.Stream()));
+#elif defined(HYDROGEN_HAVE_ROCM)
+    H_CHECK_HIP(
+        hipMemsetAsync(buffer, 0x0, numEntries * sizeof(G),
+                       syncInfo_.Stream()));
+#endif
 }

-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU

 } // namespace
@@ -262,7 +304,7 @@ void Memory<G,D>::Empty()
 template <typename G, Device D>
 void Memory<G,D>::ResetSyncInfo(SyncInfo<D> const& syncInfo)
 {
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     // FIXME: This treats this case as an error. Alternatively, this
     // could reallocate memory. See SetMode() below.
     if ((size_ > 0) && (D == Device::GPU) && (mode_ == 1))
     {
         LogicError("Cannot assign new SyncInfo object to "
                    "already-allocated CUB memory.");
     }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     syncInfo_ = syncInfo;
 }
@@ -316,7 +358,7 @@ unsigned int Memory<G,D>::Mode() const
 EL_EXTERN template class Memory;

 // GPU instantiations
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 EL_EXTERN template class Memory;
 EL_EXTERN template class Memory;
 #endif
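The MemoryPool changes that follow back the pinned-host pool (allocation mode 1 above) with hipHostMalloc/hipHostFree on ROCm builds. A usage sketch, relying only on the Allocate/Free interface exercised by Memory/impl.hpp above:

    void PinnedBufferSketch()
    {
        auto& pool = El::PinnedHostMemoryPool();
        void* buf = pool.Allocate(1 << 20); // 1 MiB of pinned host memory
        // ... stage data here for asynchronous host<->device transfers ...
        pool.Free(buf); // returns the block to the pool, not to the OS
    }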
*/ MemoryPool& PinnedHostMemoryPool(); /** Destroy singleton instance of CUDA pinned host memory pool. */ void DestroyPinnedHostMemoryPool(); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU /** Get singleton instance of host memory pool. */ MemoryPool& HostMemoryPool(); /** Destroy singleton instance of host memory pool. */ diff --git a/include/El/core/ProxyDevice.hpp b/include/El/core/ProxyDevice.hpp index 92e34b50e8..88efe62af3 100644 --- a/include/El/core/ProxyDevice.hpp +++ b/include/El/core/ProxyDevice.hpp @@ -24,12 +24,12 @@ class AbstractMatrixReadDeviceProxy proxy_ = new proxy_type{ static_cast const&>(A)}; break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: proxy_ = new proxy_type{ static_cast const&>(A)}; break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("AbstractMatrixReadDeviceProxy: Bad device."); } diff --git a/include/El/core/View/impl.hpp b/include/El/core/View/impl.hpp index cf96fe2384..c3885a5f17 100644 --- a/include/El/core/View/impl.hpp +++ b/include/El/core/View/impl.hpp @@ -66,12 +66,12 @@ void View(AbstractMatrix& A, AbstractMatrix& B) View(static_cast&>(A), static_cast&>(B)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: View(static_cast&>(A), static_cast&>(B)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } @@ -88,12 +88,12 @@ void LockedView(AbstractMatrix& A, const AbstractMatrix& B) LockedView(static_cast&>(A), static_cast&>(B)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: LockedView(static_cast&>(A), static_cast&>(B)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } @@ -441,12 +441,12 @@ void View(AbstractMatrix& A, AbstractMatrix& B, View(static_cast&>(A), static_cast&>(B), I, J); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: View(static_cast&>(A), static_cast&>(B), I, J); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } @@ -464,12 +464,12 @@ void LockedView(AbstractMatrix& A, AbstractMatrix const& B, LockedView(static_cast&>(A), static_cast&>(B), I, J); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: LockedView(static_cast&>(A), static_cast&>(B), I, J); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } diff --git a/include/El/core/imports/aluminum.hpp b/include/El/core/imports/aluminum.hpp index 3705916c98..fbb4674dae 100644 --- a/include/El/core/imports/aluminum.hpp +++ b/include/El/core/imports/aluminum.hpp @@ -135,7 +135,7 @@ struct BackendsForDeviceT };// struct BackendsForDeviceT // Prefer the NCCL2 backend -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template <> struct BackendsForDeviceT { @@ -151,18 +151,18 @@ struct BackendsForDeviceT #endif // HYDROGEN_HAVE_AL_MPI_CUDA >; };// struct BackendsForDeviceT -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU // Helper using statement template using BackendsForDevice = typename BackendsForDeviceT::type; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU using AllAluminumBackends = Join, BackendsForDevice>; #else using AllAluminumBackends = BackendsForDevice; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template struct DeviceForBackendT; @@ -173,7 +173,7 @@ struct DeviceForBackendT constexpr static Device value 
= Device::CPU; }; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #ifdef HYDROGEN_HAVE_NCCL2 template <> struct DeviceForBackendT @@ -188,7 +188,7 @@ struct DeviceForBackendT constexpr static Device value = Device::GPU; }; #endif // HYDROGEN_HAVE_AL_MPI_CUDA -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template constexpr Device DeviceForBackend() @@ -262,16 +262,13 @@ template <> struct SyncInfoManager { SyncInfoManager(std::string const& backend_name) + : si_{CreateNewSyncInfo()} { - H_CHECK_CUDA( - cudaEventCreateWithFlags(&si_.event_, cudaEventDisableTiming)); - H_CHECK_CUDA( - cudaStreamCreateWithFlags(&si_.stream_, cudaStreamNonBlocking)); #ifdef HYDROGEN_HAVE_NVPROF // Name the stream for debugging purposes std::string const stream_name = "H: Comm (" + backend_name + ")"; - nvtxNameCudaStreamA(si_.stream_, stream_name.c_str()); + nvtxNameCudaStreamA(si_.Stream(), stream_name.c_str()); #else (void) backend_name; #endif // HYDROGEN_HAVE_NVPROF @@ -280,10 +277,7 @@ struct SyncInfoManager { try { - H_CHECK_CUDA( - cudaEventDestroy(si_.event_)); - H_CHECK_CUDA( - cudaStreamDestroy(si_.stream_)); + DestroySyncInfo(si_); } catch (std::exception const& e) { diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 9f815aee64..d51c4d1384 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -143,6 +143,36 @@ struct Types static void Destroy(); }; +// Silence clang warnings. These are ETI'd in src/core/mpi_register.hpp. +#if !defined H_INSTANTIATING_MPI_TYPES_STRUCT +extern template struct Types; +extern template struct Types; +extern template struct Types; +extern template struct Types; +#ifdef EL_USE_64BIT_INTS +extern template struct Types; // Avoid conflict with Int +#endif +extern template struct Types; +extern template struct Types; +#ifndef EL_USE_64BIT_INTS +extern template struct Types; // Avoid conflict with Int +#endif + +#define PROTO(T) \ + extern template struct Types; \ + extern template struct Types>; \ + extern template struct Types>; + +#define EL_ENABLE_DOUBLEDOUBLE +#define EL_ENABLE_QUADDOUBLE +#define EL_ENABLE_QUAD +#define EL_ENABLE_BIGINT +#define EL_ENABLE_BIGFLOAT +#define EL_ENABLE_HALF +#include +#undef PROTO +#endif // !defined H_INSTANTIATING_MPI_TYPES_STRUCT + template struct MPIBaseHelper { typedef T value; }; template diff --git a/include/El/core/imports/mpi/aluminum_comm.hpp b/include/El/core/imports/mpi/aluminum_comm.hpp index 4e892df2d3..04a0b71bc2 100644 --- a/include/El/core/imports/mpi/aluminum_comm.hpp +++ b/include/El/core/imports/mpi/aluminum_comm.hpp @@ -71,11 +71,11 @@ inline bool SyncInfoEquiv(SyncInfo const&, return true; } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU inline bool SyncInfoEquiv(SyncInfo const& a, SyncInfo const& b) EL_NO_EXCEPT { - return a.stream_ == b.stream_; + return a.Stream() == b.Stream(); } #endif @@ -221,14 +221,14 @@ class AluminumComm return std::make_shared(comm); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template std::shared_ptr MakeWithSyncInfo( MPI_Comm comm, SyncInfo const& syncinfo) const { - return std::make_shared(comm, syncinfo.stream_); + return std::make_shared(comm, syncinfo.Stream()); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU }; // class AluminumComm }// namespace mpi diff --git a/include/El/core/imports/mpi/meta.hpp b/include/El/core/imports/mpi/meta.hpp index baf34d5e8c..1686e76d46 100644 --- a/include/El/core/imports/mpi/meta.hpp +++ b/include/El/core/imports/mpi/meta.hpp @@ -30,7 +30,7 @@ 
namespace mpi template struct IsMpiDeviceValidType : IsDeviceValidType {}; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Signed integer types template <> struct IsMpiDeviceValidType : std::true_type {}; @@ -57,7 +57,7 @@ struct IsMpiDeviceValidType : std::true_type {}; template <> struct IsMpiDeviceValidType : std::true_type {}; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #ifdef HYDROGEN_HAVE_ALUMINUM namespace internal diff --git a/include/El/macros/DeviceGuardAndPayload.h b/include/El/macros/DeviceGuardAndPayload.h index 2c6645361c..2b7c08bb61 100644 --- a/include/El/macros/DeviceGuardAndPayload.h +++ b/include/El/macros/DeviceGuardAndPayload.h @@ -30,7 +30,7 @@ ELSEIF_GUARD_AND_PAYLOAD(STAR,VR ,ELEMENT,Device::CPU) ELSEIF_GUARD_AND_PAYLOAD(VC, STAR,ELEMENT,Device::CPU) ELSEIF_GUARD_AND_PAYLOAD(VR, STAR,ELEMENT,Device::CPU) -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU ELSEIF_GUARD_AND_PAYLOAD(CIRC,CIRC,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(MC, MR ,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(MC, STAR,ELEMENT,Device::GPU) @@ -45,7 +45,7 @@ ELSEIF_GUARD_AND_PAYLOAD(STAR,VC ,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(STAR,VR ,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(VC, STAR,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(VR, STAR,ELEMENT,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU ELSEIF_GUARD_AND_PAYLOAD(CIRC,CIRC,BLOCK,Device::CPU) ELSEIF_GUARD_AND_PAYLOAD(MC, MR ,BLOCK,Device::CPU) diff --git a/include/hydrogen/Device.hpp b/include/hydrogen/Device.hpp index 69ac5d2eb5..ecc789851f 100644 --- a/include/hydrogen/Device.hpp +++ b/include/hydrogen/Device.hpp @@ -149,5 +149,23 @@ using SameDevice = EnumSame; */ template struct InterDeviceCopy; +// These should replace the InterDeviceCopy struct. +#if 0 +template +void MemCopy1DAsync( + T const* __restrict__ const src, + T * __restrict__ const dest, + size_t const size, + SyncInfo const& srcSyncInfo, + SyncInfo const& destSyncInfo); + +template +void MemCopy2DAsync( + T const* __restrict__ const src, size_t const src_ldim, + T * __restrict__ const dest, size_t const dest_ldim, + size_t const height, size_t const width, + SyncInfo const& srcSyncInfo, + SyncInfo const& destSyncInfo); +#endif // 0 }// namespace hydrogen #endif // EL_CORE_DEVICE_HPP_ diff --git a/include/hydrogen/Error.hpp b/include/hydrogen/Error.hpp new file mode 100644 index 0000000000..85ccd8e770 --- /dev/null +++ b/include/hydrogen/Error.hpp @@ -0,0 +1,97 @@ +#ifndef HYDROGEN_ERROR_HPP_ +#define HYDROGEN_ERROR_HPP_ + +#include +#include +#include +#include + +// "Basic exceptions" are those that are constructible with their +// "what string", similar to std::runtime_error and std::logic_error. 
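// [Editorial sketch, not part of the patch] To make the "basic
// exception" idea concrete: the H_ADD_BASIC_EXCEPTION_CLASS macro
// defined just below generates stateless forwarding wrappers. The
// invocation H_ADD_BASIC_EXCEPTION_CLASS(GPUError, std::runtime_error)
// that appears later in this diff expands to roughly the following,
// written out by hand here:

#include <stdexcept>
#include <utility>

// No data of its own; every constructor argument (typically a
// what-string) is forwarded to the parent, so the class is
// constructible exactly like std::runtime_error.
struct GPUError : std::runtime_error
{
    template <typename... Ts>
    GPUError(Ts&&... args)
        : std::runtime_error(std::forward<Ts>(args)...)
    {}
};

// Usage: throw GPUError("hipMalloc failed");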
+
+#define H_THROW_BASIC_ASSERT_EXCEPTION(cond, excptn, msg)       \
+    do                                                          \
+    {                                                           \
+        std::ostringstream tbe_oss__;                           \
+        tbe_oss__ << "Assertion\n\n"                            \
+                  << "    " #cond << "\n\n"                     \
+                  << "in function\n\n"                          \
+                  << "    " << H_PRETTY_FUNCTION << "\n\n"      \
+                  << "failed!\n\n"                              \
+                  << "{\n"                                      \
+                  << "  File: " << __FILE__ << "\n"             \
+                  << "  Line: " << __LINE__ << "\n"             \
+                  << "  Mesg: " << msg << "\n"                  \
+                  << "}\n";                                     \
+        ::hydrogen::break_on_me();                              \
+        throw excptn(tbe_oss__.str());                          \
+    } while (false)
+
+#define H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(excptn)                  \
+    do                                                                 \
+    {                                                                  \
+        std::ostringstream dtor_excpt_oss;                             \
+        dtor_excpt_oss << "An exception was detected in a destructor!\n\n" \
+                       << "File: " << __FILE__ << "\n"                 \
+                       << "Line: " << __LINE__ << "\n"                 \
+                       << "Function: " << H_PRETTY_FUNCTION << "\n"    \
+                       << "Exception:\n\n" << excptn.what() << "\n\n"  \
+                       << "Now calling std::terminate(). Good bye.\n"; \
+        std::cerr << dtor_excpt_oss.str() << std::endl;                \
+        ::hydrogen::break_on_me();                                     \
+        std::terminate();                                              \
+    } while (false)
+
+//
+// ASSERTIONS
+//
+
+#define H_ASSERT(cond, excptn, msg)                          \
+    if (!(cond))                                             \
+        H_THROW_BASIC_ASSERT_EXCEPTION(cond, excptn, msg)
+
+#define H_ASSERT_FALSE(cond, excptn, msg)                    \
+    if (cond)                                                \
+        H_THROW_BASIC_ASSERT_EXCEPTION(!(cond), excptn, msg)
+
+//
+// Exception classes
+//
+
+// Really, "basic exceptions" are just those that have no data and
+// forward all their arguments to their parent.
+#define H_ADD_BASIC_EXCEPTION_CLASS(name, parent)            \
+    struct name : parent                                     \
+    {                                                        \
+        template <typename... Ts>                            \
+        name(Ts&&... args)                                   \
+            : parent(std::forward<Ts>(args)...)              \
+        {}                                                   \
+    }
+
+namespace hydrogen
+{
+
+/** @class RuntimeError
+ * @brief The base exception for runtime errors thrown by Hydrogen.
+ *
+ * Runtime errors are those that are due to factors external to the
+ * program.
+ */
+//H_ADD_BASIC_EXCEPTION_CLASS(RuntimeError, std::runtime_error);
+
+/** @class LogicError
+ * @brief The base exception for logic errors thrown by Hydrogen.
+ *
+ * Logic errors are those due to factors internal to the program and
+ * are more likely to be preventable than RuntimeErrors.
+ */
+//H_ADD_BASIC_EXCEPTION_CLASS(LogicError, std::logic_error);
+
+/** @brief A no-op that can be set as a predictable breakpoint in a
+ * debugger.
+ */
+void break_on_me();
+
+}// namespace hydrogen
+#endif /* HYDROGEN_ERROR_HPP_ */
diff --git a/include/hydrogen/MultiSync.hpp b/include/hydrogen/MultiSync.hpp
new file mode 100644
index 0000000000..ab8900d044
--- /dev/null
+++ b/include/hydrogen/MultiSync.hpp
@@ -0,0 +1,80 @@
+#ifndef HYDROGEN_MULTISYNC_HPP_
+#define HYDROGEN_MULTISYNC_HPP_
+
+#include "Device.hpp"
+#include "SyncInfoBase.hpp"
+#include "SynchronizeAPI.hpp"
+#include "meta/IndexSequence.hpp"
+
+#include <tuple>
+
+namespace hydrogen
+{
+
+/** \class MultiSync
+ * \brief RAII class to wrap a bunch of SyncInfo objects.
+ *
+ * Provides basic synchronization for the common case in which an
+ * operation may act upon objects that exist on multiple distinct
+ * synchronous processing elements (e.g., cudaStreams) but actual
+ * computation can only occur on one of them.
+ *
+ * Constructing an object of this class will cause the master
+ * processing element to wait on the others, asynchronously with
+ * respect to the CPU, if possible. Symmetrically, destruction of
+ * this object will cause the other processing elements to wait on
+ * the master processing element, asynchronously with respect to the
+ * CPU, if possible.
+ *
+ * The master processing element is assumed to be the first SyncInfo
+ * passed into the constructor.
+ */ +template +class MultiSync +{ + using sync_tuple_type = std::tuple...>; + using sync_master_type = + typename std::tuple_element<0, sync_tuple_type>::type; +public: + MultiSync(SyncInfo const&... syncInfos) + : syncInfos_{syncInfos...} + { + MasterWaitOnAll(syncInfos...); + } + + ~MultiSync() + { + DTorImpl_(MakeIndexSequence()); + } + + /** @brief Implicitly convert to the master. + * + * This is to be able to pass a multisync in place of a SyncInfo + * object. It is common to create a MultiSync and then pass its + * master to a bunch of other calls. This simplifies things by + * not needing to store an external reference to the master + * SyncInfo. + */ + operator sync_master_type const& () const noexcept + { + return std::get<0>(syncInfos_); + } + +private: + template + void DTorImpl_(IndexSequence) + { + AllWaitOnMaster(std::get(syncInfos_)...); + } + + sync_tuple_type syncInfos_; +};// class MultiSync + +template +auto MakeMultiSync(SyncInfo const&... syncInfos) -> MultiSync +{ + return MultiSync(syncInfos...); +} + +}// namespace hydrogen +#endif // HYDROGEN_MULTISYNC_HPP_ diff --git a/include/hydrogen/SyncInfo.hpp b/include/hydrogen/SyncInfo.hpp index 9cf228fe6b..4982e72df5 100644 --- a/include/hydrogen/SyncInfo.hpp +++ b/include/hydrogen/SyncInfo.hpp @@ -1,192 +1,8 @@ -#ifndef EL_CORE_SYNCINFO_HPP_ -#define EL_CORE_SYNCINFO_HPP_ +#ifndef HYDROGEN_SYNCINFO_HPP_ +#define HYDROGEN_SYNCINFO_HPP_ -#include -#include +#include "SyncInfoAllDecl.hpp" +#include "SynchronizeAPI.hpp" +#include "MultiSync.hpp" -#ifdef HYDROGEN_HAVE_CUDA -#include -#endif // HYDROGEN_HAVE_CUDA - -namespace hydrogen -{ - -/** \class SyncInfo - * \brief Manage device-specific synchronization information. - * - * Device-specific synchronization information. For CPUs, this is - * empty since all CPU operations are synchronous with respect to the - * host. For GPUs, this will be a stream and an associated event. - * - * The use-case for this is to cope with the matrix-free part of the - * interface. Many of the copy routines have the paradigm that they - * take Matrixs as arguments and then the host will organize and - * dispatch subkernels that operate on data buffers, i.e., T[] - * data. In the GPU case, for example, this provides a lightweight - * way to pass the CUDA stream through the T* interface without an - * entire matrix (which, semantically, may not make sense). - * - * This also might be useful for interacting with - * Aluminum/MPI/NCCL/whatever. It essentially enables tagged - * dispatch, where the tags possibly contain some extra - * device-specific helpers. - */ -template struct SyncInfo -{ - SyncInfo() {} -};// struct SyncInfo - -// Adding synchronization points is generally a no-op -template -void AddSynchronizationPoint(SyncInfo... 
As) -{} - -template -void Synchronize(SyncInfo const&) -{} - -#ifdef HYDROGEN_HAVE_CUDA - -template <> -struct SyncInfo -{ - SyncInfo() : SyncInfo{GPUManager::Stream(), GPUManager::Event()} {} - - SyncInfo(cudaStream_t stream, cudaEvent_t event) - : stream_{stream}, event_{event} {} - - cudaStream_t stream_; - cudaEvent_t event_; -};// struct SyncInfo - - -inline void AddSynchronizationPoint(SyncInfo const& syncInfo) -{ - H_CHECK_CUDA(cudaEventRecord(syncInfo.event_, syncInfo.stream_)); -} - -inline void AddSynchronizationPoint( - SyncInfo const& A, SyncInfo const& B) -{ - throw std::logic_error("I don't know what should happen here."); -} - -inline void AddSynchronizationPoint( - SyncInfo const& A, SyncInfo const& B) -{ - throw std::logic_error("I don't know what should happen here."); -} - -// This captures the work done on A and forces B to wait for completion -inline void AddSynchronizationPoint( - SyncInfo const& A, SyncInfo const& B) -{ - if (A.stream_ != B.stream_) - { - AddSynchronizationPoint(A); - H_CHECK_CUDA(cudaStreamWaitEvent(B.stream_, A.event_, 0)); - } -} - -// This captures the work done on A and forces B and C to wait for completion -inline void AddSynchronizationPoint( - SyncInfo const& A, - SyncInfo const& B, SyncInfo const& C) -{ - bool const ABdiff = (A.stream_ != B.stream_); - bool const ACdiff = (A.stream_ != C.stream_); - - if (ABdiff || ACdiff) - AddSynchronizationPoint(A); - - if (ABdiff) - H_CHECK_CUDA(cudaStreamWaitEvent(B.stream_, A.event_, 0)); - - if (ACdiff) - H_CHECK_CUDA(cudaStreamWaitEvent(C.stream_, A.event_, 0)); -} - -inline void Synchronize(SyncInfo const& syncInfo) -{ - H_CHECK_CUDA(cudaStreamSynchronize(syncInfo.stream_)); -} - -#endif // HYDROGEN_HAVE_CUDA - -template -void AllWaitOnMaster( - SyncInfo const& Master, SyncInfo const&... Others) -{ - AddSynchronizationPoint(Master, Others...); -} - -template -void MasterWaitOnAll(SyncInfo const& Master) -{} - -template -void MasterWaitOnAll( - SyncInfo const& Master, SyncInfo const& Other, - SyncInfo const&... others) -{ - AddSynchronizationPoint(Other, Master); - MasterWaitOnAll(Master, others...); -} - -/** \class MultiSync - * \brief RAII class to wrap a bunch of SyncInfo objects. - * - * Provides basic synchronization for the common case in which an - * operation may act upon objects that exist on multiple distinct - * synchronous processing elements (e.g., cudaStreams) but actual - * computation can only occur on one of them. - * - * Constructing an object of this class will cause the master - * processing element to wait on the others, asynchronously with - * respect to the CPU, if possible. Symmetrically, destruction of - * this object will cause the other processing elements to wait on - * the master processing element, asynchronously with respect to the - * CPU, if possible. - * - * The master processing element is assumed to be the first SyncInfo - * passed into the constructor. - */ -template -class MultiSync -{ -public: - MultiSync(SyncInfo const&... syncInfos) - : syncInfos_{syncInfos...} - { - SyncMasterToAll_(MakeIndexSequence()); - } - - ~MultiSync() - { - SyncAllToMaster_(MakeIndexSequence()); - } -private: - - template - void SyncMasterToAll_(IndexSequence) - { - MasterWaitOnAll(std::get(syncInfos_)...); - } - - template - void SyncAllToMaster_(IndexSequence) - { - AllWaitOnMaster(std::get(syncInfos_)...); - } - - std::tuple...> syncInfos_; -};// class MultiSync - -template -auto MakeMultiSync(SyncInfo const&... 
syncInfos) -> MultiSync -{ - return MultiSync(syncInfos...); -} - -}// namespace hydrogen -#endif // EL_CORE_SYNCINFO_HPP_ +#endif // HYDROGEN_SYNCINFO_HPP_ diff --git a/include/hydrogen/SyncInfoAllDecl.hpp b/include/hydrogen/SyncInfoAllDecl.hpp new file mode 100644 index 0000000000..023edf897a --- /dev/null +++ b/include/hydrogen/SyncInfoAllDecl.hpp @@ -0,0 +1,12 @@ +#ifndef HYDROGEN_SYNCINFOALLDECL_HPP_ +#define HYDROGEN_SYNCINFOALLDECL_HPP_ + +#include + +#include "SyncInfoBase.hpp" + +#ifdef HYDROGEN_HAVE_GPU +#include "device/gpu/SyncInfo.hpp" +#endif // HYDROGEN_HAVE_GPU + +#endif // HYDROGEN_SYNCINFOALLDECL_HPP_ diff --git a/include/hydrogen/SyncInfoBase.hpp b/include/hydrogen/SyncInfoBase.hpp new file mode 100644 index 0000000000..72b0ca54ac --- /dev/null +++ b/include/hydrogen/SyncInfoBase.hpp @@ -0,0 +1,129 @@ +#ifndef HYDROGEN_SYNCINFOBASE_HPP_ +#define HYDROGEN_SYNCINFOBASE_HPP_ + +#include + +#include "Device.hpp" + +namespace hydrogen +{ + +/** \class SyncInfo + * \brief Manage device-specific synchronization information. + * + * Device-specific synchronization information. For CPUs, this is + * empty since all CPU operations are synchronous with respect to the + * host. For GPUs, this will be a stream and an associated event. + * + * The use-case for this is to cope with the matrix-free part of the + * interface. Many of the copy routines have the paradigm that they + * take Matrixs as arguments and then the host will organize and + * dispatch subkernels that operate on data buffers, i.e., T[] + * data. In the GPU case, for example, this provides a lightweight + * way to pass the CUDA stream through the T* interface without an + * entire matrix (which, semantically, may not make sense). + * + * This also might be useful for interacting with + * Aluminum/MPI/NCCL/whatever. It essentially enables tagged + * dispatch, where the tags possibly contain some extra + * device-specific helpers. + */ +template +class SyncInfo; + +template <> +class SyncInfo +{ +public: + SyncInfo() noexcept = default; + ~SyncInfo() noexcept = default; +};// struct SyncInfo + +template +bool operator==(SyncInfo const&, SyncInfo const&) +{ + return true; +} + +template +bool operator!=(SyncInfo const&, SyncInfo const&) +{ + return false; +} + +template +bool operator==(SyncInfo const&, SyncInfo const&) +{ + return false; +} + +template +bool operator!=(SyncInfo const&, SyncInfo const&) +{ + return true; +} + +/** @brief Get a new instance of a certain SyncInfo class. + * + * For CPU, this will be empty, as usual. For GPU, this will have a + * *new* stream and event. + */ +template +SyncInfo CreateNewSyncInfo(); + +/** @brief Create a new CPU SyncInfo object. */ +template <> +inline SyncInfo CreateNewSyncInfo() +{ + return SyncInfo{}; +} + +/** @brief Reset any internal state in the SyncInfo object. + * + * For CPU, this will do nothing. For GPU, this will destroy the + * stream and event. + */ +template +void DestroySyncInfo(SyncInfo&); + +/** @brief Destroy the CPU SyncInfo. */ +inline void DestroySyncInfo(SyncInfo&) noexcept {} + +/** @brief Synchronize the SyncInfo with the main (CPU) thread. */ +template +void Synchronize(SyncInfo const&); + +inline void Synchronize(SyncInfo const&) {} + +/** @brief Add information to the SyncInfo object identifying this + * execution point. + */ +template +void AddSynchronizationPoint( + SyncInfo const& master, + SyncInfo const&... 
others); + +inline void AddSynchronizationPoint(SyncInfo const&) +{} + +inline void AddSynchronizationPoint(SyncInfo const&, + SyncInfo const&) +{} + +inline void AddSynchronizationPoint(SyncInfo const&, + SyncInfo const&, + SyncInfo const&) +{} + +namespace details +{ +template +void AddSyncPoint(SyncInfo const&, SyncInfo const&); + +inline void AddSyncPoint(SyncInfo const&, + SyncInfo const&) noexcept +{} + +}// namespace details +}// namespace hydrogen +#endif // HYDROGEN_SYNCINFOBASE_HPP_ diff --git a/include/hydrogen/SynchronizeAPI.hpp b/include/hydrogen/SynchronizeAPI.hpp new file mode 100644 index 0000000000..339bcf355e --- /dev/null +++ b/include/hydrogen/SynchronizeAPI.hpp @@ -0,0 +1,41 @@ +#ifndef HYDROGEN_SYNCHRONIZEAPI_HPP_ +#define HYDROGEN_SYNCHRONIZEAPI_HPP_ + +#include "SyncInfo.hpp" + +namespace hydrogen +{ + +// This synchronizes the additional SyncInfos to the "master". That +// is, the execution streams described by the "others" will wait +// for the "master" stream. +template +void AddSynchronizationPoint( + SyncInfo const& master, + SyncInfo const&... others) +{ + AddSynchronizationPoint(master); + + int dummy[] = { (details::AddSyncPoint(master, others), 0)... }; + (void) dummy; +} + +template +void AllWaitOnMaster( + SyncInfo const& master, SyncInfo const&... others) +{ + AddSynchronizationPoint(master, others...); +} + +template +void MasterWaitOnAll( + SyncInfo const& master, + SyncInfo const&... others) +{ + int dummy[] = { + (AddSynchronizationPoint(others, master), 0)...}; + (void) dummy; +} + +}// namespace hydrogen +#endif // HYDROGEN_SYNCHRONIZEAPI_HPP_ diff --git a/include/hydrogen/blas/BLAS_Common.hpp b/include/hydrogen/blas/BLAS_Common.hpp index 18b93a2a31..80ab5978a9 100644 --- a/include/hydrogen/blas/BLAS_Common.hpp +++ b/include/hydrogen/blas/BLAS_Common.hpp @@ -16,9 +16,12 @@ enum class BLAS_Op AXPY, COPY, DGMM, + DOT, GEAM, GEMM, + GEMMSTRIDEDBATCHED, GEMV, + NRM2, SCAL, /** @brief Axpy for 2D data with leading dimension */ AXPY2D, @@ -74,5 +77,17 @@ enum class SideMode RIGHT, }; +/** @brief Describes where pointers point. */ +enum class PointerMode +{ + HOST, + DEVICE, +};// enum class PointerMode + +namespace gpu_blas +{ +/** @brief Set the pointer mode of the underlying library. */ +void SetPointerMode(PointerMode mode); +} }// namespace hydrogen #endif // HYDROGEN_BLAS_COMMON_HPP_ diff --git a/include/hydrogen/blas/GPU_BLAS_decl.hpp b/include/hydrogen/blas/GPU_BLAS_decl.hpp index 2eef6440c4..6406110c73 100644 --- a/include/hydrogen/blas/GPU_BLAS_decl.hpp +++ b/include/hydrogen/blas/GPU_BLAS_decl.hpp @@ -256,6 +256,49 @@ void Copy(SizeT num_rows, SizeT num_cols, T* B, SizeT row_stride_B, SizeT ldb, SyncInfo const& syncinfo); +/** @brief A dot-product operation for 1-D memory. + * + * @tparam T (Inferred) The type of data. + * @tparam SizeT (Inferred) The type used to express size information. + * + * @param num_entries The number of entries in X and Y. + * @param X The first vector (device memory). + * @param stride_X The stride of X. + * @param Y The second vector (device memory). + * @param stride_Y The stride of Y. + * @param result The result of the dot product (host or device memory). + * @param[in] syncinfo The synchronization information for this + * operation. + * + * @ingroup device_blas + */ +template +void Dot(SizeT num_entries, + T const* X, SizeT stride_X, + T const* Y, SizeT stride_Y, + T* result, + SyncInfo const& syncinfo); + +/** @brief Computes the 2-norm of 1-D memory. + * + * @tparam T (Inferred) The type of data. 
+ * @tparam SizeT (Inferred) The type used to express size information.
+ *
+ * @param num_entries The number of entries in X.
+ * @param X The vector (device memory).
+ * @param stride_X The stride of X.
+ * @param result The computed 2-norm (host or device memory).
+ * @param[in] syncinfo The synchronization information for this
+ * operation.
+ *
+ * @ingroup device_blas
+ */
+template <typename T, typename SizeT>
+void Nrm2(SizeT num_entries,
+          T const* X, SizeT stride_X,
+          T* result,
+          SyncInfo<Device::GPU> const& syncinfo);
+
 /** @brief 1-D Scale operation in GPU memory.
  *
  * This is in-place scaling:
@@ -391,6 +434,53 @@ void Gemm(
     T* C, SizeT ldc,
     SyncInfo<Device::GPU> const& syncinfo);
 
+/** @brief Batched, strided matrix-matrix product in GPU memory.
+ *
+ * @tparam T (Inferred) The type of the data. Should be a field.
+ * @tparam SizeT (Inferred) The type used to express size information.
+ * @tparam StrideT (Inferred) The type used to express stride information.
+ *
+ * @param[in] transpA The operation flag for `A` indicating `NORMAL`,
+ *     `TRANSPOSE`, or `CONJ_TRANSPOSE`.
+ * @param[in] transpB The operation flag for `B` indicating `NORMAL`,
+ *     `TRANSPOSE`, or `CONJ_TRANSPOSE`.
+ * @param[in] m The number of rows in `op(A)` and C.
+ * @param[in] n The number of columns in `op(B)` and C.
+ * @param[in] k The number of columns in `op(A)` and rows in `op(B)`.
+ * @param[in] alpha The scaling term on the multiplicative term.
+ * @param[in] A A matrix in column-major format.
+ * @param[in] lda The leading dimension of A.
+ * @param[in] strideA The stride between consecutive A matrices.
+ * @param[in] B A matrix in column-major format.
+ * @param[in] ldb The leading dimension of B.
+ * @param[in] strideB The stride between consecutive B matrices.
+ * @param[in] beta The scaling applied to the input value of the
+ *     target matrix.
+ * @param[in,out] C The target matrix. Initial values are scaled by
+ *     beta and updated with the result of the product.
+ * @param[in] ldc The leading dimension of C.
+ * @param[in] strideC The stride between consecutive C matrices.
+ * @param[in] batchCount The number of GEMMs in the batch.
+ * @param[in] syncinfo The synchronization information for this
+ *     operation.
+ *
+ * @ingroup device_blas
+ */
+template <typename T, typename SizeT, typename StrideT>
+void GemmStridedBatched(
+    TransposeMode transpA, TransposeMode transpB,
+    SizeT m, SizeT n, SizeT k,
+    T const& alpha,
+    T const* A, SizeT lda, StrideT strideA,
+    T const* B, SizeT ldb, StrideT strideB,
+    T const& beta,
+    T* C, SizeT ldc, StrideT strideC,
+    SizeT batchCount,
+    SyncInfo<Device::GPU> const& syncinfo);
+
+
 ///@}
 /** @name BLAS-like Extension Routines */
 ///@{
diff --git a/include/hydrogen/blas/GPU_BLAS_impl.hpp b/include/hydrogen/blas/GPU_BLAS_impl.hpp
index 82e7caf2a5..14cfbd0656 100644
--- a/include/hydrogen/blas/GPU_BLAS_impl.hpp
+++ b/include/hydrogen/blas/GPU_BLAS_impl.hpp
@@ -40,7 +40,7 @@ namespace gpu_blas_impl = hydrogen::cublas;
 // needs.
#define GPU_BLAS_USE_ROCBLAS -#include +#include namespace gpu_blas_impl = hydrogen::rocblas; @@ -340,7 +340,7 @@ void Axpy2DImpl(SizeT nrows, SizeT ncols, { Axpy_GPU_impl(nrows, ncols, alpha, A, SizeT(1), lda, - B, SizeT(1), ldb, si.stream_); + B, SizeT(1), ldb, si); } template const& si) { Axpy_GPU_impl( - transpA, nrows, ncols, alpha, A, lda, B, ldb, si.stream_); + transpA, nrows, ncols, alpha, A, lda, B, ldb, si); } template const& si) { - Copy_GPU_impl(size, X, incx, Y, incy, si.stream_); + Copy_GPU_impl(size, X, incx, Y, incy, si); } template const& si) { - Scale_GPU_impl(size, alpha, X, incx, si.stream_); + Scale_GPU_impl(size, alpha, X, incx, si); } template const& si) { - Scale_GPU_impl(nrows, ncols, alpha, A, lda, si.stream_); + Scale_GPU_impl(nrows, ncols, alpha, A, lda, si); } // diff --git a/include/hydrogen/blas/gpu/Axpy.hpp b/include/hydrogen/blas/gpu/Axpy.hpp index 001d7eabbd..34e4364534 100644 --- a/include/hydrogen/blas/gpu/Axpy.hpp +++ b/include/hydrogen/blas/gpu/Axpy.hpp @@ -5,7 +5,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -20,27 +24,27 @@ namespace hydrogen * @tparam T (Inferred) The type of data. Must be the same for source * and destination matrices. * - * @param num_rows The number of rows in the matrix - * @param num_cols The number of columns in the matrix - * @param alpha The scaling factor - * @param src The source matrix, in column-major ordering. Must not - * overlap with the destination matrix. - * @param src_row_stride The number of `T`s between rows in a column - * of the source matrix. For "traditional" packed matrices, this - * will be "1". - * @param src_col_stride The number of `T`s between columns in a row - * of the source matrix. For "traditional" packed matrices, this - * will be the leading dimension. - * @param dest The destination matrix, in column-major ordering. Must not - * overlap with the source matrix. - * @param dest_row_stride The number of `T`s between rows in a column - * of the destination matrix. For "traditional" packed matrices, - * this will be "1". - * @param dest_col_stride The number of `T`s between columns in a row - * of the destination matrix. For "traditional" packed matrices, + * @param[in] num_rows The number of rows in the matrix. + * @param[in] num_cols The number of columns in the matrix. + * @param[in] alpha The scaling factor. + * @param[in] src The source matrix, in column-major ordering. Must + * not overlap with the destination matrix. + * @param[in] src_row_stride The number of `T`s between rows in a + * column of the source matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] src_col_stride The number of `T`s between columns in a + * row of the source matrix. For "traditional" packed matrices, * this will be the leading dimension. - * @param stream The CUDA stream on which the kernel should be - * launched. + * @param[out] dest The destination matrix, in column-major + * ordering. Must not overlap with the source matrix. + * @param[in] dest_row_stride The number of `T`s between rows in a + * column of the destination matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] dest_col_stride The number of `T`s between columns in a + * row of the destination matrix. For "traditional" packed + * matrices, this will be the leading dimension. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. 
*/ template >> @@ -48,7 +52,7 @@ void Axpy_GPU_impl( SizeT num_rows, SizeT num_cols, T alpha, T const* src, SizeT src_row_stride, SizeT src_col_stride, T* dest, SizeT dest_row_stride, SizeT dest_col_stride, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -56,7 +60,7 @@ template const&) { throw std::logic_error("Axpy: Type not valid on GPU."); } @@ -80,8 +84,8 @@ void Axpy_GPU_impl( * @param[in,out] B The destination matrix, in column-major * ordering. Must not overlap with the source matrix. * @param[in] ldb The leading dimension of B. - * @param[in] stream The CUDA stream on which the kernel should be - * launched. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. */ template >> @@ -89,7 +93,7 @@ void Axpy_GPU_impl( TransposeMode transpA, SizeT num_rows, SizeT num_cols, T alpha, T const* A, SizeT lda, T* B, SizeT ldb, - cudaStream_t stream); + SyncInfo const& sync_info); }// namespace hydrogen #endif // HYDROGEN_BLAS_GPU_AXPY_HPP_ diff --git a/include/hydrogen/blas/gpu/Copy.hpp b/include/hydrogen/blas/gpu/Copy.hpp index a3de39990f..638cc4d7b0 100644 --- a/include/hydrogen/blas/gpu/Copy.hpp +++ b/include/hydrogen/blas/gpu/Copy.hpp @@ -4,7 +4,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -20,16 +24,17 @@ namespace hydrogen * @tparam T (Inferred) The type of data. Must be the same for source * and destination matrices. * - * @param num_entries The number of entries in the array - * @param src The source array. Must not overlap with the destination - * array. - * @param src_stride The number of `T`s between entries in the source array. - * @param dest The destination array. Must not overlap with the + * @param[in] num_entries The number of entries in the array + * @param[in] src The source array. Must not overlap with the + * destination array. + * @param[in] src_stride The number of `T`s between entries in the + * source array. + * @param[out] dest The destination array. Must not overlap with the * source array. - * @param dest_stride The number of `T`s between entires in the + * @param[in] dest_stride The number of `T`s between entires in the * destination array. - * @param stream The CUDA stream on which the kernel should be - * launched. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. * * @throws std::logic_error If the type is not supported on GPU or if * the arrays overlap. @@ -41,7 +46,7 @@ void Copy_GPU_impl( SizeT num_entries, SrcT const* src, SizeT src_stride, DestT* dest, SizeT dest_stride, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -51,7 +56,7 @@ void Copy_GPU_impl( SizeT const&, SrcT const* const&, SizeT const&, DestT* const&, SizeT const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Type not valid on GPU"); } @@ -64,26 +69,26 @@ void Copy_GPU_impl( * @tparam T (Inferred) The type of data. Must be the same for source * and destination matrices. * - * @param num_rows The number of rows in the matrix - * @param num_cols The number of columns in the matrix - * @param src The source matrix, in column-major ordering. Must not - * overlap with the destination matrix. - * @param src_row_stride The number of `T`s between rows in a column - * of the source matrix. For "traditional" packed matrices, this - * will be "1". - * @param src_col_stride The number of `T`s between columns in a row - * of the source matrix. 
For "traditional" packed matrices, this - * will be the leading dimension. - * @param dest The destination matrix, in column-major ordering. Must not - * overlap with the source matrix. - * @param dest_row_stride The number of `T`s between rows in a column - * of the destination matrix. For "traditional" packed matrices, - * this will be "1". - * @param dest_col_stride The number of `T`s between columns in a row - * of the destination matrix. For "traditional" packed matrices, + * @param[in] num_rows The number of rows in the matrix. + * @param[in] num_cols The number of columns in the matrix. + * @param[in] src The source matrix, in column-major ordering. Must + * not overlap with the destination matrix. + * @param[in] src_row_stride The number of `T`s between rows in a + * column of the source matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] src_col_stride The number of `T`s between columns in a + * row of the source matrix. For "traditional" packed matrices, * this will be the leading dimension. - * @param stream The CUDA stream on which the kernel should be - * launched. + * @param[out] dest The destination matrix, in column-major + * ordering. Must not overlap with the source matrix. + * @param[in] dest_row_stride The number of `T`s between rows in a + * column of the destination matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] dest_col_stride The number of `T`s between columns in a + * row of the destination matrix. For "traditional" packed + * matrices, this will be the leading dimension. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. * * @todo See if we can statically assert that the operator= between * SrcT and DestT will succeed on the device. 
@@ -95,7 +100,7 @@ void Copy_GPU_impl( SizeT num_rows, SizeT num_cols, SrcT const* src, SizeT src_row_stride, SizeT src_col_stride, DestT* dest, SizeT dest_row_stride, SizeT dest_col_stride, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -104,7 +109,7 @@ template const&) { throw std::logic_error("Copy: Type not valid on GPU."); } diff --git a/include/hydrogen/blas/gpu/Fill.hpp b/include/hydrogen/blas/gpu/Fill.hpp index 350dfa364b..9fc5af5828 100644 --- a/include/hydrogen/blas/gpu/Fill.hpp +++ b/include/hydrogen/blas/gpu/Fill.hpp @@ -8,7 +8,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -18,17 +22,25 @@ namespace hydrogen template >> void Fill_GPU_impl(size_t height, size_t width, T const& alpha, T* buffer, size_t ldim, - cudaStream_t stream); + SyncInfo const& sync_info); template >, typename=void> void Fill_GPU_impl(size_t const&, size_t const&, T const&, T* const&, size_t const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Fill: Type not valid on GPU."); } +template >> +void Fill_GPU_1D_impl(T* buffer, size_t const& size, + T const& alpha, + SyncInfo const& sync_info) +{ + Fill_GPU_impl(size, 1, alpha, buffer, size, sync_info); +} + }// namespace hydrogen #endif // HYDROGEN_BLAS_GPU_FILL_HPP_ diff --git a/include/hydrogen/blas/gpu/Hadamard.hpp b/include/hydrogen/blas/gpu/Hadamard.hpp index 3639c1f010..8079ec544d 100644 --- a/include/hydrogen/blas/gpu/Hadamard.hpp +++ b/include/hydrogen/blas/gpu/Hadamard.hpp @@ -8,7 +8,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -21,7 +25,7 @@ void Hadamard_GPU_impl( T const* A, size_t row_stride_A, size_t lda, T const* B, size_t row_stride_B, size_t ldb, T* C, size_t row_stride_C, size_t ldc, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -31,7 +35,7 @@ void Hadamard_GPU_impl( T const* const&, size_t const&, size_t const&, T const* const&, size_t const&, size_t const&, T* const&, size_t const&, size_t const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Hadamard: Type not valid on GPU."); } diff --git a/include/hydrogen/blas/gpu/Scale.hpp b/include/hydrogen/blas/gpu/Scale.hpp index 9b1277c47b..e7d444f663 100644 --- a/include/hydrogen/blas/gpu/Scale.hpp +++ b/include/hydrogen/blas/gpu/Scale.hpp @@ -4,7 +4,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -25,8 +29,8 @@ namespace hydrogen * @param[in,out] buffer The array. * @param[in] stride The number of `T`s between entries in the input * array. - * @param[in] stream The CUDA stream on which the kernel should be - * launched. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. * * @throws std::logic_error If the type is not supported on GPU. */ @@ -36,7 +40,7 @@ void Scale_GPU_impl( SizeT num_entries, T const& alpha, T* buffer, SizeT stride, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -45,7 +49,7 @@ void Scale_GPU_impl( SizeT const&, T const&, T const* const&, SizeT const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Scale: Type not valid on GPU"); } @@ -63,8 +67,8 @@ void Scale_GPU_impl( * @param[in] alpha The scaling parameter. * @param[in,out] buffer The matrix, in column-major ordering. * @param[in] ldim The leading dimension of the data in buffer. 
- * @param[in] stream The CUDA stream on which the kernel should be - * launched. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. * * @todo See if we can statically assert that the operator*= will * succeed on the device. @@ -75,14 +79,14 @@ void Scale_GPU_impl( SizeT num_rows, SizeT num_cols, T const& alpha, T* buffer, SizeT ldim, - cudaStream_t stream); + SyncInfo const& sync_info); template >, typename=void> void Scale_GPU_impl(SizeT const&, SizeT const&, T const&, T const* const&, SizeT const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Scale: Type not valid on GPU."); } diff --git a/include/hydrogen/blas/gpu/Transpose.hpp b/include/hydrogen/blas/gpu/Transpose.hpp index 618cbd0539..d8f6e14958 100644 --- a/include/hydrogen/blas/gpu/Transpose.hpp +++ b/include/hydrogen/blas/gpu/Transpose.hpp @@ -4,7 +4,11 @@ #include #include -#include +#if defined(HYDROGEN_HAVE_CUDA) +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -28,8 +32,8 @@ namespace hydrogen * @param[out] dest The destination matrix, in column-major ordering. Must not * overlap with the source matrix. Contents will be overwritten. * @param[in] ldb The leading dimension of B. - * @param stream The CUDA stream on which the kernel should be - * launched. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. */ template >> @@ -37,7 +41,7 @@ void Transpose_GPU_impl( SizeT num_rows, SizeT num_cols, T const* A, SizeT lda, T* B, SizeT ldb, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -46,7 +50,7 @@ void Transpose_GPU_impl( SizeT const&, SizeT const&, T const* const&, SizeT const&, T* const&, SizeT const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Copy: Type not valid on GPU."); } diff --git a/include/hydrogen/device/GPU.hpp b/include/hydrogen/device/GPU.hpp new file mode 100644 index 0000000000..ed23297ead --- /dev/null +++ b/include/hydrogen/device/GPU.hpp @@ -0,0 +1,163 @@ +#ifndef HYDROGEN_DEVICE_GPU_HPP_ +#define HYDROGEN_DEVICE_GPU_HPP_ + +/** @defgroup gpu_mgmt GPU device interaction and management + * + * These functions provide a runtime-agnostic API for basic + * interaction with GPUs. The exposed functionality is deliberately + * quite basic and represents the functions needed for Hydrogen. + */ + +#include + +#include +#include + +#include + +namespace hydrogen +{ + +/** @namespace gpu + * @brief Interface functions for interacting with the GPU. + * + * This is basically a "backended" system where the backends are + * mutually exclusive and therefore largely hidden from view. At time + * of writing, the backends are CUDA and ROCm/HIP. This will be + * determined at configure time based on user-input configure options + * and/or system interrogation. + * + * @note Since HIP is a compatibility layer, it should be + * possible to just universally use HIP. However, we wish to allow + * the two backends to evolve independently. Thus, it should + * theoretically be possible to just universally use the HIP + * backend. However, in the current implementation, CUDA-specific + * optimizations will be lost if compiling under HIP (as they will + * likely be protected by "HYDROGEN_HAVE_CUDA", which will not be + * defined in this case). + */ +namespace gpu +{ + +/** @name Environment management */ +///@{ + +/** @brief Initialize the GPU driver and runtime. 
+ * + * This incorporates anything that needs to be done before kernels + * can be dispatched to the GPU. In CUDA terms, this establishes a + * CUDA context. + * + * @ingroup gpu_mgmt + */ +void Initialize(); + +/** @brief Cleanup and shutdown any GPU driver/runtime state. + * + * This performs any tasks that are required to close the GPU + * environment and leave a clean state. + * + * @ingroup gpu_mgmt + */ +void Finalize(); + +/** @brief Query if the GPU environment is initialized. + * @ingroup gpu_mgmt + */ +bool IsInitialized() noexcept; + +/** @brief Query if the GPU environment is finalized. + * + * Finalized means "not initialized", so an environment that has + * never been initialized is, in this sense, "finalized". + * + * @ingroup gpu_mgmt + */ +inline bool IsFinalized() noexcept { return !IsInitialized(); } + +///@} +/** @name Device management */ +///@{ + +/** @brief Get the number of GPUs visible to this process. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +size_t DeviceCount(); + +/** @brief Get the ID of the currently selected GPU. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +int CurrentDevice(); + +/** @brief Get the ID of the default GPU. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +int DefaultDevice(); + +/** @brief Get the device ID we should be using. + * @details This uses environment variables set by most MPI libraries + * and/or launchers (slurm,lsf) to determine a device ID. Devices + * are assigned round-robin based on local rank. + * @param[in] device_count Number of visible devices. + * @ingroup gpu_mgmt + */ +int ComputeDeviceId(unsigned int device_count) noexcept; + +/** @brief Select the given device. + * + * @param[in] device_id The ID of the device to select. Must be less + * than the number of available GPUs. + * + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +void SetDevice(int device_id); + +/** @brief Block the host until all device execution has completed. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +void SynchronizeDevice(); + +///@} +/** @name Execution control */ +///@{ + +/** @brief Get the default SyncInfo object for this session. + * + * Note that Hydrogen will use this SyncInfo by default. On CUDA + * platforms, for example, it will be different from the "default + * CUDA stream". + * + * This SyncInfo object will persist for as long as + * IsInitialized(). Note that if the GPU environment is finalized and + * reinitialized, this SyncInfo object in the new environment may + * differ from the previous environment. + * + * @throws GPUError If the runtime detects any errors. + * + * @ingroup gpu_mgmt + */ +SyncInfo const& DefaultSyncInfo() noexcept; + +///@} + +}// namespace gpu + +/** @name SyncInfo management */ +///@{ + +/** @brief Create a new CPU SyncInfo object. */ +template <> +SyncInfo CreateNewSyncInfo(); + +/** @brief Destroy the GPU SyncInfo. 
*/ +void DestroySyncInfo(SyncInfo&); + +///@} + +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_HPP_ diff --git a/include/hydrogen/device/gpu/BasicCopy.hpp b/include/hydrogen/device/gpu/BasicCopy.hpp new file mode 100644 index 0000000000..59e37a5f2f --- /dev/null +++ b/include/hydrogen/device/gpu/BasicCopy.hpp @@ -0,0 +1,12 @@ +#ifndef HYDROGEN_DEVICE_GPU_BASICCOPY_HPP +#define HYDROGEN_DEVICE_GPU_BASICCOPY_HPP + +#include + +#if defined(HYDROGEN_HAVE_CUDA) +#include "cuda/CUDACopy.hpp" +#elif defined(HYDROGEN_HAVE_ROCM) +#include "rocm/ROCmCopy.hpp" +#endif + +#endif // HYDROGEN_DEVICE_GPU_BASICCOPY_HPP diff --git a/include/hydrogen/device/gpu/cuda/CUB.hpp b/include/hydrogen/device/gpu/CUB.hpp similarity index 78% rename from include/hydrogen/device/gpu/cuda/CUB.hpp rename to include/hydrogen/device/gpu/CUB.hpp index 0f94a4bfa1..75f10ad167 100644 --- a/include/hydrogen/device/gpu/cuda/CUB.hpp +++ b/include/hydrogen/device/gpu/CUB.hpp @@ -1,13 +1,24 @@ #ifndef HYDROGEN_IMPORTS_CUB_HPP_ #define HYDROGEN_IMPORTS_CUB_HPP_ +#include "El/hydrogen_config.h" + +#ifdef HYDROGEN_HAVE_CUDA #include #include +#elif defined HYDROGEN_HAVE_ROCM +#include +#endif // HYDROGEN_HAVE_CUB namespace hydrogen { namespace cub { +#ifdef HYDROGEN_HAVE_CUDA +namespace cub_impl = ::cub; +#elif defined HYDROGEN_HAVE_ROCM +namespace cub_impl = ::hipcub; +#endif // HYDROGEN_HAVE_CUDA /** @brief Get singleton instance of CUB memory pool. * @@ -27,7 +38,7 @@ namespace cub * redirect output on a per-rank basis, either through the * features exposed by their MPI launcher or by some other means. */ - ::cub::CachingDeviceAllocator& MemoryPool(); + cub_impl::CachingDeviceAllocator& MemoryPool(); /** Destroy singleton instance of CUB memory pool. */ void DestroyMemoryPool(); diff --git a/include/hydrogen/device/gpu/CUDA.hpp b/include/hydrogen/device/gpu/CUDA.hpp index af2fe77dd6..fe6acd216e 100644 --- a/include/hydrogen/device/gpu/CUDA.hpp +++ b/include/hydrogen/device/gpu/CUDA.hpp @@ -1,361 +1,8 @@ -#ifndef HYDROGEN_IMPORTS_CUDA_HPP_ -#define HYDROGEN_IMPORTS_CUDA_HPP_ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_HPP_ -#include +#include "cuda/CUDAError.hpp" +#include "cuda/CUDALaunchKernel.hpp" +#include "cuda/CUDAManagement.hpp" -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace hydrogen -{ - -/** @class CudaError - * - * Exception class for CUDA errors. - * - * \todo Clean up the error-handling macros - */ -struct CudaError : std::runtime_error -{ - std::string build_error_string_( - cudaError_t cuda_error, char const* file, int line, bool async = false) - { - std::ostringstream oss; - oss << ( async ? "Asynchronous CUDA error" : "CUDA error" ) - << " (error code=" << cuda_error << ") (" << file << ":" << line << "): " - << cudaGetErrorString(cuda_error); - return oss.str(); - } - CudaError(cudaError_t cuda_error, char const* file, int line, bool async = false) - : std::runtime_error{build_error_string_(cuda_error,file,line,async)} - {} -}; // struct CudaError - -#define H_CUDA_SYNC(async) \ - do \ - { \ - /* Synchronize GPU and check for errors. 
*/ \ - cudaError_t status_CUDA_SYNC = cudaDeviceSynchronize(); \ - if (status_CUDA_SYNC == cudaSuccess) \ - status_CUDA_SYNC = cudaGetLastError(); \ - if (status_CUDA_SYNC != cudaSuccess) { \ - cudaDeviceReset(); \ - throw hydrogen::CudaError(status_CUDA_SYNC,__FILE__,__LINE__,async); \ - } \ - } \ - while( 0 ) -#define H_FORCE_CHECK_CUDA(cuda_call) \ - do \ - { \ - /* Call CUDA API routine, synchronizing before and after to */ \ - /* check for errors. */ \ - H_CUDA_SYNC(true); \ - cudaError_t status_CHECK_CUDA = cuda_call ; \ - if( status_CHECK_CUDA != cudaSuccess ) { \ - cudaDeviceReset(); \ - throw hydrogen::CudaError(status_CHECK_CUDA,__FILE__,__LINE__,false); \ - } \ - H_CUDA_SYNC(false); \ - } while (0) -#define H_FORCE_CHECK_CUDA_NOSYNC(cuda_call) \ - do \ - { \ - /* Call CUDA API routine, and check for errors without */ \ - /* synchronizing. */ \ - cudaError_t status_CHECK_CUDA = cuda_call ; \ - if( status_CHECK_CUDA != cudaSuccess ) { \ - cudaDeviceReset(); \ - throw hydrogen::CudaError(status_CHECK_CUDA,__FILE__,__LINE__,false); \ - } \ - } while (0) -#define H_LAUNCH_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - do \ - { \ - /* Dg is a dim3 specifying grid dimensions. */ \ - /* Db is a dim3 specifying block dimensions. */ \ - /* Ns is a size_t specifying dynamic memory. */ \ - /* S is a cudaStream_t specifying stream. */ \ - kernel <<< Dg, Db, Ns, S >>> args ; \ - } \ - while (0) -#define H_FORCE_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - do \ - { \ - /* Launch CUDA kernel, synchronizing before */ \ - /* and after to check for errors. */ \ - H_CUDA_SYNC(true); \ - H_LAUNCH_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args); \ - H_CUDA_SYNC(false); \ - } \ - while (0) - -#ifdef HYDROGEN_RELEASE_BUILD -#define H_CHECK_CUDA( cuda_call ) H_FORCE_CHECK_CUDA_NOSYNC(cuda_call) -#define H_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - H_LAUNCH_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) -#else -#define H_CHECK_CUDA( cuda_call ) H_FORCE_CHECK_CUDA( cuda_call ) -#define H_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - H_FORCE_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) -#endif // HYDROGEN_RELEASE_BUILD - -// Function to determine if a pointer is GPU memory -inline bool IsGPUMemory(const void* ptr) -{ - cudaPointerAttributes attrs; - auto err = cudaPointerGetAttributes(&attrs, ptr); - if (err == cudaErrorInvalidValue) - { - if ((err = cudaGetLastError()) == cudaErrorInvalidValue) - return false; - else - H_FORCE_CHECK_CUDA(err); - } - else - { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - if ((err = cudaGetLastError()) == cudaSuccess) - return (attrs.memoryType == cudaMemoryTypeDevice); - else - H_FORCE_CHECK_CUDA(err); -#pragma GCC diagnostic pop - } - return false;// silence compiler warning -} - -/** Initialize CUDA environment. - * We assume that all MPI ranks within a compute node have access to - * exactly one unique GPU or to the same (possibly empty) list of - * GPUs. GPU assignments can be controled with the - * CUDA_VISIBLE_DEVICES environment variable. - */ -void InitializeCUDA(int,char*[]); -/** Finalize CUDA environment. */ -void FinalizeCUDA(); - -/** Singleton class to manage CUDA objects. - * This class also manages cuBLAS objects. Note that the CUDA device - * is set whenever the singleton instance is requested, i.e. in most - * of the static functions. 
- */ -class GPUManager -{ -public: - - GPUManager( const GPUManager& ) = delete; - GPUManager& operator=( const GPUManager& ) = delete; - ~GPUManager(); - - /** Create new singleton instance of CUDA manager. */ - static void Create( int device = 0 ); - /** Initilize CUBLAS. */ - static void InitializeCUBLAS(); - /** Destroy singleton instance of CUDA manager. */ - static void Destroy(); - /** Get singleton instance of CUDA manager. */ - static GPUManager* Instance(); - /** Get number of visible CUDA devices. */ - static unsigned int NumDevices(); - /** Get currently active CUDA device. */ - static int Device(); - /** Set active CUDA device. */ - static void SetDevice( int device ); - /** Get CUDA stream. */ - static cudaStream_t Stream(); - /** Get CUDA event. */ - static cudaEvent_t Event(); - /** Synchronize CUDA stream. */ - static void SynchronizeStream(); - /** Synchronize CUDA device. - * If checkError is true, an exception will be thrown if an error - * from an asynchronous CUDA kernel is detected. - */ - static void SynchronizeDevice( bool checkError = false ); - /** Get cuBLAS handle. */ - static cublasHandle_t cuBLASHandle(); - -private: - - /** Singleton instance. */ - static std::unique_ptr instance_; - - /** Number of visible CUDA devices. */ - unsigned int numDevices_; - /** Currently active CUDA device. */ - int device_; - /** CUDA stream. */ - cudaStream_t stream_; - /** CUDA event. */ - cudaEvent_t event_; - /** cuBLAS handle */ - cublasHandle_t cublasHandle_; - - GPUManager( int device = 0 ); - -}; // class GPUManager - -template -constexpr cudaMemcpyKind CUDAMemcpyKind(); - -template <> -constexpr cudaMemcpyKind CUDAMemcpyKind() -{ - return cudaMemcpyHostToDevice; -} - -template <> -constexpr cudaMemcpyKind CUDAMemcpyKind() -{ - return cudaMemcpyDeviceToHost; -} - -template <> -constexpr cudaMemcpyKind CUDAMemcpyKind() -{ - return cudaMemcpyDeviceToDevice; -} - -template <> -struct InterDeviceCopy -{ - template - static void MemCopy1DAsync( - T * __restrict__ const dest, - T const* __restrict__ const src, - size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(T), - CUDAMemcpyKind(), - stream)); - } - -#if defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) - // These two types are bitwise-compatible across the two devices. 
- static void MemCopy1DAsync(gpu_half_type * __restrict__ const dest, - cpu_half_type const* __restrict__ const src, - size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(gpu_half_type), - CUDAMemcpyKind(), - stream)); - } - - static void MemCopy1DAsync( - cpu_half_type * __restrict__ const dest, - gpu_half_type const* __restrict__ const src, - size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(gpu_half_type), - CUDAMemcpyKind(), - stream)); - } -#endif // defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) - - template - static void MemCopy2DAsync( - T * __restrict__ const dest, size_t const dest_ldim, - T const* __restrict__ const src, - size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(T), - src, src_ldim*sizeof(T), - height*sizeof(T), width, - CUDAMemcpyKind(), - stream)); - } - -#if defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) - // These two types are bitwise-compatible across the two devices. - static void MemCopy2DAsync( - gpu_half_type * __restrict__ const dest, - size_t const dest_ldim, - cpu_half_type const* __restrict__ const src, - size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(gpu_half_type), - src, src_ldim*sizeof(cpu_half_type), - height*sizeof(gpu_half_type), width, - CUDAMemcpyKind(), - stream)); - } - static void MemCopy2DAsync( - cpu_half_type * __restrict__ const dest, - size_t const dest_ldim, - gpu_half_type const* __restrict__ const src, - size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA(cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(cpu_half_type), - src, src_ldim*sizeof(gpu_half_type), - height*sizeof(gpu_half_type), width, - CUDAMemcpyKind(), - stream)); - } -#endif // defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) -};// InterDevice - -template <> -struct InterDeviceCopy -{ - template - static void MemCopy1DAsync( - T * __restrict__ const dest, - T const* __restrict__ const src, size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(T), - CUDAMemcpyKind(), - stream)); - } - - template - static void MemCopy2DAsync( - T * __restrict__ const dest, size_t const dest_ldim, - T const* __restrict__ const src, size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(T), - src, src_ldim*sizeof(T), - height*sizeof(T), width, - CUDAMemcpyKind(), - stream)); - } -};// InterDevice - -} // namespace hydrogen - -#endif // HYDROGEN_IMPORTS_CUDA_HPP_ +#endif // HYDROGEN_DEVICE_GPU_CUDA_HPP_ diff --git a/include/hydrogen/device/gpu/GPUError.hpp b/include/hydrogen/device/gpu/GPUError.hpp new file mode 100644 index 0000000000..a2a1e6ca16 --- /dev/null +++ b/include/hydrogen/device/gpu/GPUError.hpp @@ -0,0 +1,18 @@ +#ifndef HYDROGEN_DEVICE_GPUERROR_HPP_ +#define HYDROGEN_DEVICE_GPUERROR_HPP_ + +#include + +#include + +namespace hydrogen +{ + +/** @name ErrorHandling */ +///@{ + +H_ADD_BASIC_EXCEPTION_CLASS(GPUError, std::runtime_error); + +///@} +}// namespace +#endif 
// HYDROGEN_DEVICE_GPUERROR_HPP_ diff --git a/include/hydrogen/device/gpu/ROCm.hpp b/include/hydrogen/device/gpu/ROCm.hpp new file mode 100644 index 0000000000..086dc8227f --- /dev/null +++ b/include/hydrogen/device/gpu/ROCm.hpp @@ -0,0 +1,8 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_HPP_ + +#include "rocm/ROCmError.hpp" +#include "rocm/ROCmLaunchKernel.hpp" +#include "rocm/ROCmManagement.hpp" + +#endif // HYDROGEN_DEVICE_GPU_ROCM_HPP_ diff --git a/include/hydrogen/device/gpu/SyncInfo.hpp b/include/hydrogen/device/gpu/SyncInfo.hpp new file mode 100644 index 0000000000..b685cd2b4e --- /dev/null +++ b/include/hydrogen/device/gpu/SyncInfo.hpp @@ -0,0 +1,12 @@ +#ifndef HYDROGEN_DEVICE_GPU_SYNCINFO_HPP_ +#define HYDROGEN_DEVICE_GPU_SYNCINFO_HPP_ + +#include + +#if defined HYDROGEN_HAVE_CUDA +#include "cuda/SyncInfo.hpp" +#elif defined HYDROGEN_HAVE_ROCM +#include "rocm/SyncInfo.hpp" +#endif + +#endif // HYDROGEN_DEVICE_GPU_SYNCINFO_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDACopy.hpp b/include/hydrogen/device/gpu/cuda/CUDACopy.hpp new file mode 100644 index 0000000000..bcf97eb06d --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDACopy.hpp @@ -0,0 +1,112 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUDACOPY_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUDACOPY_HPP_ + +#include + +#include +#include + +#include +#include + +#include "CUDAError.hpp" + +namespace hydrogen +{ +namespace gpu +{ + +/** @todo Flesh out documentation + * @todo these are actually only valid for "packed" types + */ + +// These functions are synchronous with respect to their SyncInfo +// objects (that is, they require explicit synchronization to the +// host). + +template +void Fill1DBuffer(T* buffer, size_t num_elements, T value, + SyncInfo const& si) +{ + Fill_GPU_1D_impl(buffer, num_elements, value, si); +} + +template +void Copy1DIntraDevice(T const* H_RESTRICT src, T* H_RESTRICT dest, + size_t num_elements, + SyncInfo const& si) +{ + H_CHECK_CUDA( + cudaMemcpyAsync( + dest, src, num_elements*sizeof(T), + cudaMemcpyDeviceToDevice, si.Stream())); +} + +template +void Copy1DToHost(T const* H_RESTRICT src, T* H_RESTRICT dest, + size_t num_elements, + SyncInfo const& src_si) +{ + H_CHECK_CUDA( + cudaMemcpyAsync( + dest, src, num_elements*sizeof(T), + cudaMemcpyDeviceToHost, src_si.Stream())); +} + +template +void Copy1DToDevice(T const* H_RESTRICT src, T* H_RESTRICT dest, + size_t num_elements, + SyncInfo const& dest_si) +{ + H_CHECK_CUDA( + cudaMemcpyAsync( + dest, src, num_elements*sizeof(T), + cudaMemcpyHostToDevice, dest_si.Stream())); +} + + +template +void Copy2DIntraDevice(T const* src, size_t src_ldim, + T* dest, size_t dest_ldim, + size_t height, size_t width, + SyncInfo const& si) +{ + H_CHECK_CUDA( + cudaMemcpy2DAsync( + dest, dest_ldim*sizeof(T), + src, src_ldim*sizeof(T), + height*sizeof(T), width, + cudaMemcpyDeviceToDevice, si.Stream())); +} + +template +void Copy2DToHost(T const* src, size_t src_ldim, + T* dest, size_t dest_ldim, + size_t height, size_t width, + SyncInfo const& src_si) +{ + H_CHECK_CUDA( + cudaMemcpy2DAsync( + dest, dest_ldim*sizeof(T), + src, src_ldim*sizeof(T), + height*sizeof(T), width, + cudaMemcpyDeviceToHost, src_si.Stream())); +} + +template +void Copy2DToDevice(T const* src, size_t src_ldim, + T* dest, size_t dest_ldim, + size_t height, size_t width, + SyncInfo const& dest_si) +{ + H_CHECK_CUDA( + cudaMemcpy2DAsync( + dest, dest_ldim*sizeof(T), + src, src_ldim*sizeof(T), + height*sizeof(T), width, + cudaMemcpyHostToDevice, dest_si.Stream())); +} + 
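Taken together, the copy routines above are asynchronous on the stream carried by their SyncInfo argument; a caller only sees valid data after an explicit synchronization, as the comment at the top of this file notes. A minimal usage sketch (the function and buffer names here are illustrative, not part of the header):

    #include <vector>

    // Read n elements back from device memory and wait until they are valid.
    std::vector<float> ReadBack(float const* d_data, size_t n)
    {
        hydrogen::SyncInfo<hydrogen::Device::GPU> si; // default stream/event
        std::vector<float> host(n);
        hydrogen::gpu::Copy1DToHost(d_data, host.data(), n, si);
        hydrogen::Synchronize(si); // the copy is async; wait before reading
        return host;
    }
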
+}// namespace gpu +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUDACOPY_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDAError.hpp b/include/hydrogen/device/gpu/cuda/CUDAError.hpp new file mode 100644 index 0000000000..e2794e9caa --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDAError.hpp @@ -0,0 +1,52 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDAERROR_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDAERROR_HPP_ + +#include + +#include + +#include + +#ifdef HYDROGEN_GPU_CALLS_ARE_SYNCHRONOUS +#define H_SYNC_CUDA() cudaDeviceSynchronize() +#else +#define H_SYNC_CUDA() +#endif + +// Error handling macro +#define H_CHECK_CUDA(cmd) \ + do \ + { \ + H_SYNC_CUDA(); \ + auto h_check_cuda_error_code__ = cmd; \ + H_ASSERT(h_check_cuda_error_code__ == cudaSuccess, \ + ::hydrogen::CUDAError, \ + (cudaDeviceReset(), \ + ::hydrogen::cuda::BuildCUDAErrorMessage( \ + #cmd, h_check_cuda_error_code__))); \ + H_SYNC_CUDA(); \ + } while (false) + +namespace hydrogen +{ + +/** @class CUDAError + * @brief Exception class representing an error detected by the CUDA + * runtime. + */ +H_ADD_BASIC_EXCEPTION_CLASS(CUDAError, GPUError); + +namespace cuda +{ + +/** @brief Write an error message describing the error detected in CUDA. + * @param[in] cmd The expression that raised the error. + * @param[in] error_code The error code reported by CUDA. + * @returns A string describing the error. + */ +std::string BuildCUDAErrorMessage( + std::string const& cmd, cudaError_t error_code); + +}// namespace cuda +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDAERROR_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDALaunchKernel.hpp b/include/hydrogen/device/gpu/cuda/CUDALaunchKernel.hpp new file mode 100644 index 0000000000..331dc26a58 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDALaunchKernel.hpp @@ -0,0 +1,31 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDALAUNCHKERNEL_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDALAUNCHKERNEL_HPP_ + +#include + +#include +#include + +#include "CUDAError.hpp" + +namespace hydrogen +{ +namespace gpu +{ + +template +void LaunchKernel( + F kernel, dim3 const& gridDim, dim3 const& blkDim, + size_t sharedMem, SyncInfo const& si, + Args... kernel_args) +{ + void* args[] = { const_cast(reinterpret_cast(&kernel_args))... 
}; + H_CHECK_CUDA( + cudaLaunchKernel( + (void const*) kernel, + gridDim, blkDim, args, sharedMem, si.Stream())); +} + +}// namespace gpu +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDALAUNCHKERNEL_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDAManagement.hpp b/include/hydrogen/device/gpu/cuda/CUDAManagement.hpp new file mode 100644 index 0000000000..e501ef79d7 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDAManagement.hpp @@ -0,0 +1,22 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDAMANAGEMENT_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDAMANAGEMENT_HPP_ + +#include + +namespace hydrogen +{ + +using gpuEvent_t = cudaEvent_t; +using gpuStream_t = cudaStream_t; + +namespace cuda +{ +cudaEvent_t GetDefaultEvent() noexcept; +cudaStream_t GetDefaultStream() noexcept; +cudaEvent_t GetNewEvent(); +cudaStream_t GetNewStream(); +void FreeEvent(cudaEvent_t& event); +void FreeStream(cudaStream_t& stream); +}// namespace cuda +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDAMANAGEMENT_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/SyncInfo.hpp b/include/hydrogen/device/gpu/cuda/SyncInfo.hpp new file mode 100644 index 0000000000..72eabbf012 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/SyncInfo.hpp @@ -0,0 +1,84 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_SYNCINFO_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_SYNCINFO_HPP_ + +#include + +#include +#include + +#include "CUDAError.hpp" +#include "CUDAManagement.hpp" + +namespace hydrogen +{ + +template <> +class SyncInfo +{ +public: + SyncInfo() + : SyncInfo{cuda::GetDefaultStream(), cuda::GetDefaultEvent()} + {} + + SyncInfo(cudaStream_t stream, cudaEvent_t event) + : stream_{stream}, event_{event} + {} + + void Merge(SyncInfo const& si) noexcept + { + if (si.stream_) + stream_ = si.stream_; + if (si.event_) + event_ = si.event_; + } + + cudaStream_t Stream() const noexcept { return stream_; } + cudaEvent_t Event() const noexcept { return event_; } +private: + friend void DestroySyncInfo(SyncInfo&); + cudaStream_t stream_; + cudaEvent_t event_; +};// struct SyncInfo + +inline void AddSynchronizationPoint(SyncInfo const& syncInfo) +{ + H_CHECK_CUDA(cudaEventRecord(syncInfo.Event(), syncInfo.Stream())); +} + + +namespace details +{ +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ +} + +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ + // The CPU must wait for the GPU to catch up. + Synchronize(master); // wait for "master" +} + +// This captures the work done on A and forces "others" to wait for +// completion. 
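A brief aside before the stream-to-stream overload below: the event-based ordering it implements corresponds to this raw CUDA pattern (a sketch; the function name is illustrative and the streams are assumed to have been created by the caller):

    // Make `other` wait for all work already submitted to `master`,
    // without blocking the host.
    void OrderStreams(cudaStream_t master, cudaStream_t other)
    {
        cudaEvent_t ev;
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
        cudaEventRecord(ev, master);       // mark the current tail of `master`
        cudaStreamWaitEvent(other, ev, 0); // `other` blocks until the mark fires
        cudaEventDestroy(ev);              // safe: the pending dependency persists
    }
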
+template +inline +void AddSyncPoint( + SyncInfo const& master, SyncInfo const& other) +{ + if (master.Stream() != other.Stream()) + H_CHECK_CUDA( + cudaStreamWaitEvent(other.Stream(), master.Event(), 0)); +} +}// namespace details + +inline void Synchronize(SyncInfo const& syncInfo) +{ + H_CHECK_CUDA(cudaStreamSynchronize(syncInfo.Stream())); +} + +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_SYNCINFO_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLAS.hpp b/include/hydrogen/device/gpu/cuda/cuBLAS.hpp index ead7b70f00..936f91b412 100644 --- a/include/hydrogen/device/gpu/cuda/cuBLAS.hpp +++ b/include/hydrogen/device/gpu/cuda/cuBLAS.hpp @@ -1,420 +1,12 @@ -#ifndef HYDROGEN_IMPORTS_CUBLAS_HPP_ -#define HYDROGEN_IMPORTS_CUBLAS_HPP_ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_HPP_ -#include +#include "cuBLASError.hpp" +#include "cuBLASManagement.hpp" +#include "cuBLASMeta.hpp" +#include "cuBLASUtil.hpp" -#include -#include -#include -#include -#include -#include +// The API wrapper declarations +#include "cuBLAS_API.hpp" -namespace hydrogen -{ - -#define ADD_ENUM_TO_STRING_CASE(enum_value) \ - case enum_value: \ - return #enum_value - -/** \class cuBLASError - * \brief Exception class for cuBLAS errors. - */ -struct cuBLASError : std::runtime_error -{ - static std::string get_error_string_(cublasStatus_t status) - { - switch (status) - { - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_SUCCESS); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_INITIALIZED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ALLOC_FAILED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INVALID_VALUE); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ARCH_MISMATCH); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_MAPPING_ERROR); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_EXECUTION_FAILED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INTERNAL_ERROR); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_SUPPORTED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_LICENSE_ERROR); - default: - return "unknown cuBLAS error"; - } - } - - std::string build_error_string_( - cublasStatus_t status, char const* file, int line) - { - std::ostringstream oss; - oss << "cuBLAS error (" << file << ":" << line << "): " - << get_error_string_(status); - return oss.str(); - } - - cuBLASError(cublasStatus_t status, char const* file, int line) - : std::runtime_error{build_error_string_(status,file,line)} - {} -};// struct cublasError - -#undef ADD_ENUM_TO_STRING_CASE - -#define H_FORCE_CHECK_CUBLAS(cublas_call) \ - do \ - { \ - /* Check for earlier asynchronous errors. */ \ - H_FORCE_CHECK_CUDA(cudaSuccess); \ - { \ - /* Make cuBLAS call and check for errors. */ \ - const cublasStatus_t status_CHECK_CUBLAS = (cublas_call); \ - if (status_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) \ - { \ - cudaDeviceReset(); \ - throw cuBLASError(status_CHECK_CUBLAS,__FILE__,__LINE__); \ - } \ - } \ - { \ - /* Check for CUDA errors. */ \ - cudaError_t status_CHECK_CUBLAS = cudaDeviceSynchronize(); \ - if (status_CHECK_CUBLAS == cudaSuccess) \ - status_CHECK_CUBLAS = cudaGetLastError(); \ - if (status_CHECK_CUBLAS != cudaSuccess) \ - { \ - cudaDeviceReset(); \ - throw CudaError( \ - status_CHECK_CUBLAS,__FILE__,__LINE__,false); \ - } \ - } \ - } while (0) - -#define H_FORCE_CHECK_CUBLAS_NOSYNC(cublas_call) \ - do \ - { \ - /* Make cuBLAS call and check for errors without */ \ - /* synchronizing. 
*/ \ - const cublasStatus_t status_CHECK_CUBLAS = (cublas_call); \ - if (status_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) \ - { \ - cudaDeviceReset(); \ - throw cuBLASError(status_CHECK_CUBLAS,__FILE__,__LINE__); \ - } \ - } while (0) - -#ifdef HYDROGEN_RELEASE_BUILD -#define H_CHECK_CUBLAS(cublas_call) \ - H_FORCE_CHECK_CUBLAS_NOSYNC(cublas_call) -#else -#define H_CHECK_CUBLAS(cublas_call) \ - H_FORCE_CHECK_CUBLAS(cublas_call) -#endif // #ifdef HYDROGEN_RELEASE_BUILD - -namespace cublas -{ - -/** @name cuBLAS utility functions. */ -///@{ - -/** @brief Initialize CUBLAS. - * - * This must be called after `MPI_Init` is called with - * MVAPICH2-GDR. Effectively, this creates the global cuBLAS library - * handle. - */ -void Initialize(); - -/** @class NativeType - * @brief Metafunction mapping type names to CUDA/cuBLAS equivalents. - * - * The mapping should provide bitwise equivalence. - * - * @note This belongs at this level because rocBLAS defines types (or - * names of types) that are local to the BLAS - * implementation. Additionally, it's feasible to conceive of - * custom types on the GPU that would, likewise, need to be - * mapped to the types that cuBLAS knows about. - * - * @todo Add static assertions to ensure only valid types get mapped. - */ -template -struct NativeTypeT; - -// Built-in types are their own native types -template <> struct NativeTypeT { using type = float; }; -template <> struct NativeTypeT { using type = double; }; -template <> -struct NativeTypeT { using type = cuComplex; }; -template <> -struct NativeTypeT { using type = cuDoubleComplex; }; - -// Complex and Double-Complex types require conversion -template <> -struct NativeTypeT> { using type = cuComplex; }; -template <> -struct NativeTypeT> { using type = cuDoubleComplex; }; - -// Half precision requires conversion as well -#ifdef HYDROGEN_GPU_USE_FP16 -template <> struct NativeTypeT<__half> { using type = __half; }; -#ifdef HYDROGEN_HAVE_HALF -template <> struct NativeTypeT { using type = __half; }; -#endif // HYDROGEN_HAVE_HALF -#endif // HYDROGEN_GPU_USE_FP16 - -/** @brief Convenience wrapper for NativeTypeT */ -template -using NativeType = typename NativeTypeT::type; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS -namespace meta_details -{ -template -auto Try_HasNativeType(int) -> SubstitutionSuccess>; -template -auto Try_HasNativeType(...) -> std::false_type; -}// namespace meta_details -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @struct HasNativeType - * @brief Predicate that determines if a type is mappable to a - * library-native type. - */ -template -struct HasNativeType : decltype(meta_details::Try_HasNativeType(0)) {}; - -/** @class IsSupportedType_Base - * @brief Predicate indicating that a type is supported within cuBLAS - * for the given operation. - * - * This is used to map internal cuBLAS types to the operations that - * are supported. For example, `float` is always supported but - * `__half` only has support in a few functions. - */ -template -struct IsSupportedType_Base : std::false_type {}; - -template -struct IsSupportedType_Base : std::true_type {}; -template -struct IsSupportedType_Base : std::true_type {}; -template -struct IsSupportedType_Base : std::true_type {}; -template -struct IsSupportedType_Base : std::true_type {}; - -// No need to further test CUDA because this file isn't included if -// either we don't have GPUs at all or we don't have CUDA support. 
-#ifdef HYDROGEN_GPU_USE_FP16 -template <> -struct IsSupportedType_Base<__half, BLAS_Op::AXPY> : std::true_type {}; -template <> -struct IsSupportedType_Base<__half, BLAS_Op::GEMM> : std::true_type {}; -template <> -struct IsSupportedType_Base<__half, BLAS_Op::SCAL> : std::true_type {}; -#endif // HYDROGEN_GPU_USE_FP16 - -/** @class IsSupportedType - * @brief Predicate indicating that the given type is compatible with - * cuBLAS. - * - * This is true when either the type is a compatible cuBLAS type - * (e.g., float) or when it is binarily equivalent to one (e.g., - * std::complex).. - */ -template ::value> -struct IsSupportedType - : IsSupportedType_Base, op> -{}; - -template -struct IsSupportedType : std::false_type {}; - -/** @brief cuBLAS uses ints to represent sizes. */ -using SizeT = int; - -/** @brief Convert a value to the size type expected by the cuBLAS - * library. - * - * If `HYDROGEN_DO_BOUNDS_CHECKING` is defined, this will do a - * "safe cast" (it will verify that `val` is in the dynamic range of - * `int`. Otherwise it will do a regular static_cast. - */ -template -#ifdef HYDROGEN_DO_BOUNDS_CHECKING -SizeT ToSizeT(T const& val) -{ - return narrow_cast(val); -} -#else -SizeT ToSizeT(T const& val) noexcept -{ - return static_cast(val); -} -#endif // HYDROGEN_DO_BOUNDS_CHECKING - -/** @brief Overload to prevent extra work in the case of dynamic range checking. */ -inline SizeT ToSizeT(SizeT const& val) noexcept -{ - return val; -} - -/** @brief Convert an TransposeMode to the cuBLAS operation type. */ -inline cublasOperation_t -ToNativeTransposeMode(TransposeMode const& orient) noexcept -{ - switch (orient) - { - case TransposeMode::TRANSPOSE: - return CUBLAS_OP_T; - case TransposeMode::CONJ_TRANSPOSE: - return CUBLAS_OP_C; - default: // TransposeMode::NORMAL - return CUBLAS_OP_N; - } -} - -/** @brief Convert a SideMode to the cuBLAS side mode type. */ -inline cublasSideMode_t -ToNativeSideMode(SideMode const& side) noexcept -{ - if (side == SideMode::LEFT) - return CUBLAS_SIDE_LEFT; - - return CUBLAS_SIDE_RIGHT; -} - -/** @brief Get the cuBLAS library handle. */ -cublasHandle_t GetLibraryHandle() noexcept; - -/** @class SyncManager - * @brief Manage stream synchronization within cuBLAS. 
- */ -class SyncManager -{ -public: - SyncManager(cublasHandle_t handle, SyncInfo const& si); - ~SyncManager(); -private: - cudaStream_t orig_stream_; -};// class SyncManager - -///@} -/** @name BLAS-1 Routines */ -///@{ - -#define ADD_AXPY_DECL(ScalarType) \ - void Axpy(cublasHandle_t handle, \ - int n, ScalarType const& alpha, \ - ScalarType const* X, int incx, \ - ScalarType* Y, int incy) - -#define ADD_COPY_DECL(ScalarType) \ - void Copy(cublasHandle_t handle, \ - int n, ScalarType const* X, int incx, \ - ScalarType* Y, int incy) - -#define ADD_SCALE_DECL(ScalarType) \ - void Scale(cublasHandle_t handle, \ - int n, ScalarType const& alpha, \ - ScalarType* X, int incx) - -#ifdef HYDROGEN_GPU_USE_FP16 -ADD_AXPY_DECL(__half); -#endif // HYDROGEN_GPU_USE_FP16 -ADD_AXPY_DECL(float); -ADD_AXPY_DECL(double); -ADD_AXPY_DECL(cuComplex); -ADD_AXPY_DECL(cuDoubleComplex); - -ADD_COPY_DECL(float); -ADD_COPY_DECL(double); -ADD_COPY_DECL(cuComplex); -ADD_COPY_DECL(cuDoubleComplex); - -#ifdef HYDROGEN_GPU_USE_FP16 -ADD_SCALE_DECL(__half); -#endif // HYDROGEN_GPU_USE_FP16 -ADD_SCALE_DECL(float); -ADD_SCALE_DECL(double); -ADD_SCALE_DECL(cuComplex); -ADD_SCALE_DECL(cuDoubleComplex); - -///@} -/** @name BLAS-2 Routines */ -///@{ - -#define ADD_GEMV_DECL(ScalarType) \ - void Gemv( \ - cublasHandle_t handle, \ - cublasOperation_t transpA, int m, int n, \ - ScalarType const& alpha, \ - ScalarType const* A, int lda, \ - ScalarType const* x, int incx, \ - ScalarType const& beta, \ - ScalarType* y, int incy) - -ADD_GEMV_DECL(float); -ADD_GEMV_DECL(double); -ADD_GEMV_DECL(cuComplex); -ADD_GEMV_DECL(cuDoubleComplex); - -///@} -/** @name BLAS-3 Routines */ -///@{ - -#define ADD_GEMM_DECL(ScalarType) \ - void Gemm( \ - cublasHandle_t handle, \ - cublasOperation_t transpA, \ - cublasOperation_t transpB, \ - int m, int n, int k, \ - ScalarType const& alpha, \ - ScalarType const* A, int lda, \ - ScalarType const* B, int ldb, \ - ScalarType const& beta, \ - ScalarType* C, int ldc) - -#ifdef HYDROGEN_GPU_USE_FP16 -ADD_GEMM_DECL(__half); -#endif // HYDROGEN_GPU_USE_FP16 -ADD_GEMM_DECL(float); -ADD_GEMM_DECL(double); -ADD_GEMM_DECL(cuComplex); -ADD_GEMM_DECL(cuDoubleComplex); - -///@} -/** @name BLAS-like Extension Routines */ -///@{ - -// We use this for Axpy2D, Copy2D, and Transpose -#define ADD_GEAM_DECL(ScalarType) \ - void Geam(cublasHandle_t handle, \ - cublasOperation_t transpA, \ - cublasOperation_t transpB, \ - int m, int n, \ - ScalarType const& alpha, \ - ScalarType const* A, int lda, \ - ScalarType const& beta, \ - ScalarType const* B, int ldb, \ - ScalarType* C, int ldc) - -#define ADD_DGMM_DECL(ScalarType) \ - void Dgmm(cublasHandle_t handle, \ - cublasSideMode_t side, \ - int m, int n, \ - ScalarType const* A, int lda, \ - ScalarType const* X, int incx, \ - ScalarType* C, int ldc) - -ADD_GEAM_DECL(float); -ADD_GEAM_DECL(double); -ADD_GEAM_DECL(cuComplex); -ADD_GEAM_DECL(cuDoubleComplex); - -ADD_DGMM_DECL(float); -ADD_DGMM_DECL(double); -ADD_DGMM_DECL(cuComplex); -ADD_DGMM_DECL(cuDoubleComplex); - -///@} - -}// namespace cublas -}// namespace hydrogen -#endif // HYDROGEN_IMPORTS_CUBLAS_HPP_ +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLASError.hpp b/include/hydrogen/device/gpu/cuda/cuBLASError.hpp new file mode 100644 index 0000000000..84c83c4d9b --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/cuBLASError.hpp @@ -0,0 +1,48 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASERROR_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASERROR_HPP_ + +#include + +#include 
+#include
+
+#include
+
+// Helper error-checking macro.
+#define H_CHECK_CUBLAS(cmd)                                             \
+    do                                                                  \
+    {                                                                   \
+        H_SYNC_CUDA();                                                  \
+        auto h_check_cublas_err_code__ = cmd;                           \
+        H_ASSERT(h_check_cublas_err_code__ == CUBLAS_STATUS_SUCCESS,    \
+                 cuBLASError,                                           \
+                 (cudaDeviceReset(),                                    \
+                  cublas::BuildcuBLASErrorMessage(                      \
+                      #cmd,                                             \
+                      h_check_cublas_err_code__)));                     \
+        H_SYNC_CUDA();                                                  \
+    } while (false)
+
+namespace hydrogen
+{
+
+/** @class cuBLASError
+ * @brief Exception representing errors detected by cuBLAS library.
+ */
+H_ADD_BASIC_EXCEPTION_CLASS(cuBLASError,GPUError);
+
+namespace cublas
+{
+
+/** @brief Write an error message describing the error detected in cuBLAS.
+ * @param[in] cmd The expression that raised the error.
+ * @param[in] error_code The error code reported by cuBLAS.
+ * @returns A string describing the error.
+ */
+std::string BuildcuBLASErrorMessage(
+    std::string const& cmd, cublasStatus_t error_code);
+
+}// namespace cublas
+
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASERROR_HPP_
diff --git a/include/hydrogen/device/gpu/cuda/cuBLASManagement.hpp b/include/hydrogen/device/gpu/cuda/cuBLASManagement.hpp
new file mode 100644
index 0000000000..c9ce8b0caa
--- /dev/null
+++ b/include/hydrogen/device/gpu/cuda/cuBLASManagement.hpp
@@ -0,0 +1,82 @@
+#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASMANAGEMENT_HPP_
+#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASMANAGEMENT_HPP_
+
+#include
+
+#include "cuBLASError.hpp"
+
+#include
+#include
+
+#include
+
+namespace hydrogen
+{
+
+namespace cublas
+{
+
+/** @name cuBLAS management functions. */
+///@{
+
+/** @brief Initialize cuBLAS.
+ *
+ * Creates the default library instance for cuBLAS.
+ *
+ * @note This must be called after `MPI_Init` is called with
+ * MVAPICH2-GDR. cuBLAS initialization allocates some device memory
+ * chunks, which MVAPICH2-GDR attempts to intercept but fails if
+ * MPI_Init has not been called yet. So the correct ordering of
+ * initialization seems to be first CUDA, then MPI, and then any
+ * libraries that depend on CUDA or MPI.
+ *
+ * \param[in] handle The handle to use for cuBLAS. If null, a new
+ *            handle will be created. If not null, it is
+ *            assumed that the handle has been created with a
+ *            user-side call to cublasCreate().
+ */
+void Initialize(cublasHandle_t handle=nullptr);
+
+/** @brief Finalize the cuBLAS library.
+ *
+ * Destroys the default library handle.
+ *
+ * \throws cuBLASError If the cuBLAS library detects any errors.
+ */
+void Finalize();
+
+/** @brief Replace the default cuBLAS library handle.
+ *
+ * This will destroy the current default cuBLAS library handle and
+ * assume control of the input handle. The cuBLAS library must be
+ * initialized in order to call this function.
+ *
+ * \param[in] handle The new library handle. Hydrogen will take
+ *            ownership of the new handle and destroy it in
+ *            Finalize().
+ *
+ * \throws std::logic_error If the input handle is null or the
+ *         library isn't initialized.
+ */
+void ReplaceLibraryHandle(cublasHandle_t handle);
+
+/** @brief Get the cuBLAS library handle. */
+cublasHandle_t GetLibraryHandle() noexcept;
+
+/** @class SyncManager
+ * @brief Manage stream synchronization within cuBLAS.
+ */ +class SyncManager +{ +public: + SyncManager(cublasHandle_t handle, SyncInfo const& si); + ~SyncManager(); +private: + cudaStream_t orig_stream_; +};// class SyncManager + +///@} + +}// namespace cublas +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASMANAGEMENT_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLASMeta.hpp b/include/hydrogen/device/gpu/cuda/cuBLASMeta.hpp new file mode 100644 index 0000000000..2c30235c14 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/cuBLASMeta.hpp @@ -0,0 +1,133 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASMETA_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASMETA_HPP_ + +#include + +#include +#include +#include + +#include + +namespace hydrogen +{ +namespace cublas +{ + +/** @class NativeTypeT + * @brief Metafunction mapping type names to CUDA/cuBLAS equivalents. + * + * The mapping should provide bitwise equivalence. + * + * @note This belongs at this level because rocBLAS defines types (or + * names of types) that are local to the BLAS + * implementation. Additionally, it's feasible to conceive of + * custom types on the GPU that would, likewise, need to be + * mapped to the types that cuBLAS knows about. + * + * @todo Add static assertions to ensure only valid types get mapped. + */ +template +struct NativeTypeT; + +// Built-in types are their own native types +template <> struct NativeTypeT { using type = float; }; +template <> struct NativeTypeT { using type = double; }; +template <> +struct NativeTypeT { using type = cuComplex; }; +template <> +struct NativeTypeT { using type = cuDoubleComplex; }; + +// Complex and Double-Complex types require conversion +template <> +struct NativeTypeT> { using type = cuComplex; }; +template <> +struct NativeTypeT> { using type = cuDoubleComplex; }; + +// Half precision requires conversion as well +#ifdef HYDROGEN_GPU_USE_FP16 +template <> struct NativeTypeT<__half> { using type = __half; }; +#ifdef HYDROGEN_HAVE_HALF +template <> struct NativeTypeT { using type = __half; }; +#endif // HYDROGEN_HAVE_HALF +#endif // HYDROGEN_GPU_USE_FP16 + +/** @brief Convenience wrapper for NativeTypeT */ +template +using NativeType = typename NativeTypeT::type; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace meta_details +{ +template +auto Try_HasNativeType(int) -> SubstitutionSuccess>; +template +auto Try_HasNativeType(...) -> std::false_type; +}// namespace meta_details +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** @struct HasNativeType + * @brief Predicate that determines if a type is mappable to a + * library-native type. + */ +template +struct HasNativeType : decltype(meta_details::Try_HasNativeType(0)) {}; + +/** @class IsSupportedType_Base + * @brief Predicate indicating that a type is supported within cuBLAS + * for the given operation. + * + * This is used to map internal cuBLAS types to the operations that + * are supported. For example, `float` is always supported but + * `__half` only has support in a few functions. + */ +template +struct IsSupportedType_Base : std::false_type {}; + +template +struct IsSupportedType_Base : std::true_type {}; +template +struct IsSupportedType_Base : std::true_type {}; +template +struct IsSupportedType_Base : std::true_type {}; +template +struct IsSupportedType_Base : std::true_type {}; + +// No need to further test CUDA because this file isn't included if +// either we don't have GPUs at all or we don't have CUDA support. 
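Before the half-precision specializations that follow, the behavior of the predicates above can be pinned down with a few compile-time checks. A sketch (assuming `BLAS_Op` is the enumeration from hydrogen/blas/BLAS_Common.hpp, that it lives in namespace hydrogen, and that `GEMM` is among its enumerators; `NotABlasType` is a made-up name):

    #include <complex>
    #include <type_traits>

    using hydrogen::cublas::NativeType;
    using hydrogen::cublas::HasNativeType;
    using hydrogen::cublas::IsSupportedType;

    // std::complex<float> maps bitwise onto cuComplex.
    static_assert(
        std::is_same<NativeType<std::complex<float>>, cuComplex>::value, "");

    // float is supported for every operation, GEMM included.
    static_assert(
        IsSupportedType<float, hydrogen::BLAS_Op::GEMM>::value, "");

    // A type with no native mapping is rejected outright.
    struct NotABlasType {};
    static_assert(!HasNativeType<NotABlasType>::value, "");
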
+#ifdef HYDROGEN_GPU_USE_FP16
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::AXPY> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::DOT> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::GEMM> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::GEMMSTRIDEDBATCHED>
+    : std::true_type
+{};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::NRM2> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::SCAL> : std::true_type {};
+#endif // HYDROGEN_GPU_USE_FP16
+
+/** @class IsSupportedType
+ * @brief Predicate indicating that the given type is compatible with
+ *        cuBLAS.
+ *
+ * This is true when either the type is a compatible cuBLAS type
+ * (e.g., float) or when it is bitwise equivalent to one (e.g.,
+ * std::complex).
+ */
+template ::value>
+struct IsSupportedType
+    : IsSupportedType_Base, op>
+{};
+
+template
+struct IsSupportedType : std::false_type {};
+
+}// namespace cublas
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASMETA_HPP_
diff --git a/include/hydrogen/device/gpu/cuda/cuBLASUtil.hpp b/include/hydrogen/device/gpu/cuda/cuBLASUtil.hpp
new file mode 100644
index 0000000000..b7fe1db486
--- /dev/null
+++ b/include/hydrogen/device/gpu/cuda/cuBLASUtil.hpp
@@ -0,0 +1,71 @@
+#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASUTIL_HPP_
+#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASUTIL_HPP_
+
+#include
+
+#include
+
+namespace hydrogen
+{
+namespace cublas
+{
+
+/** @brief cuBLAS uses ints to represent sizes. */
+using SizeT = int;
+
+/** @brief Convert a value to the size type expected by the cuBLAS
+ *         library.
+ *
+ * If `HYDROGEN_DO_BOUNDS_CHECKING` is defined, this will do a
+ * "safe cast" (it will verify that `val` is in the dynamic range of
+ * `int`). Otherwise it will do a regular static_cast.
+ */
+template
+#ifdef HYDROGEN_DO_BOUNDS_CHECKING
+SizeT ToSizeT(T const& val)
+{
+    return narrow_cast(val);
+}
+#else
+SizeT ToSizeT(T const& val) noexcept
+{
+    return static_cast(val);
+}
+#endif // HYDROGEN_DO_BOUNDS_CHECKING
+
+/** @brief Overload to prevent extra work in the case of dynamic range
+ *         checking.
+ */
+inline SizeT ToSizeT(SizeT const& val) noexcept
+{
+    return val;
+}
+
+/** @brief Convert a TransposeMode to the cuBLAS operation type. */
+inline cublasOperation_t
+ToNativeTransposeMode(TransposeMode const& orient) noexcept
+{
+    switch (orient)
+    {
+    case TransposeMode::TRANSPOSE:
+        return CUBLAS_OP_T;
+    case TransposeMode::CONJ_TRANSPOSE:
+        return CUBLAS_OP_C;
+    default: // TransposeMode::NORMAL
+        return CUBLAS_OP_N;
+    }
+}
+
+/** @brief Convert a SideMode to the cuBLAS side mode type.
*/ +inline cublasSideMode_t +ToNativeSideMode(SideMode const& side) noexcept +{ + if (side == SideMode::LEFT) + return CUBLAS_SIDE_LEFT; + + return CUBLAS_SIDE_RIGHT; +} + +}// namespace cublas +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASUTIL_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLAS_API.hpp b/include/hydrogen/device/gpu/cuda/cuBLAS_API.hpp new file mode 100644 index 0000000000..a9c0e886eb --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/cuBLAS_API.hpp @@ -0,0 +1,186 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_API_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_API_HPP_ + +#include + +#include + +namespace hydrogen +{ +namespace cublas +{ + +/** @name BLAS-1 Routines */ +///@{ + +#define ADD_AXPY_DECL(ScalarType) \ + void Axpy(cublasHandle_t handle, \ + int n, ScalarType const& alpha, \ + ScalarType const* X, int incx, \ + ScalarType* Y, int incy) + +#define ADD_COPY_DECL(ScalarType) \ + void Copy(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType* Y, int incy) + +#define ADD_DOT_DECL(ScalarType) \ + void Dot(cublasHandle_t handle, \ + int n, \ + ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType& output) + +#define ADD_NRM2_DECL(ScalarType) \ + void Nrm2(cublasHandle_t handle, \ + int n, \ + ScalarType const* X, int incx, \ + ScalarType& output) + +#define ADD_SCALE_DECL(ScalarType) \ + void Scale(cublasHandle_t handle, \ + int n, ScalarType const& alpha, \ + ScalarType* X, int incx) + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_AXPY_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_AXPY_DECL(float); +ADD_AXPY_DECL(double); +ADD_AXPY_DECL(cuComplex); +ADD_AXPY_DECL(cuDoubleComplex); + +ADD_COPY_DECL(float); +ADD_COPY_DECL(double); +ADD_COPY_DECL(cuComplex); +ADD_COPY_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_DOT_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_DOT_DECL(float); +ADD_DOT_DECL(double); +ADD_DOT_DECL(cuComplex); +ADD_DOT_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_NRM2_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_NRM2_DECL(float); +ADD_NRM2_DECL(double); +ADD_NRM2_DECL(cuComplex); +ADD_NRM2_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_SCALE_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_SCALE_DECL(float); +ADD_SCALE_DECL(double); +ADD_SCALE_DECL(cuComplex); +ADD_SCALE_DECL(cuDoubleComplex); + +///@} +/** @name BLAS-2 Routines */ +///@{ + +#define ADD_GEMV_DECL(ScalarType) \ + void Gemv( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const* x, int incx, \ + ScalarType const& beta, \ + ScalarType* y, int incy) + +ADD_GEMV_DECL(float); +ADD_GEMV_DECL(double); +ADD_GEMV_DECL(cuComplex); +ADD_GEMV_DECL(cuDoubleComplex); + +///@} +/** @name BLAS-3 Routines */ +///@{ + +#define ADD_GEMM_DECL(ScalarType) \ + void Gemm( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, int k, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const* B, int ldb, \ + ScalarType const& beta, \ + ScalarType* C, int ldc) + +#define ADD_GEMM_STRIDED_BATCHED_DECL(ScalarType) \ + void GemmStridedBatched( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, int k, \ + ScalarType const* alpha, \ + ScalarType const* A, int lda, \ + long long int strideA, \ + ScalarType const* B, int ldb, \ + long long int strideB, \ + 
ScalarType const* beta, \ + ScalarType* C, int ldc, \ + long long int strideC, \ + int batchCount) + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_GEMM_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_GEMM_DECL(float); +ADD_GEMM_DECL(double); +ADD_GEMM_DECL(cuComplex); +ADD_GEMM_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_GEMM_STRIDED_BATCHED_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_GEMM_STRIDED_BATCHED_DECL(float); +ADD_GEMM_STRIDED_BATCHED_DECL(double); +ADD_GEMM_STRIDED_BATCHED_DECL(cuComplex); +ADD_GEMM_STRIDED_BATCHED_DECL(cuDoubleComplex); + +///@} +/** @name BLAS-like Extension Routines */ +///@{ + +// We use this for Axpy2D, Copy2D, and Transpose +#define ADD_GEAM_DECL(ScalarType) \ + void Geam(cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const& beta, \ + ScalarType const* B, int ldb, \ + ScalarType* C, int ldc) + +#define ADD_DGMM_DECL(ScalarType) \ + void Dgmm(cublasHandle_t handle, \ + cublasSideMode_t side, \ + int m, int n, \ + ScalarType const* A, int lda, \ + ScalarType const* X, int incx, \ + ScalarType* C, int ldc) + +ADD_GEAM_DECL(float); +ADD_GEAM_DECL(double); +ADD_GEAM_DECL(cuComplex); +ADD_GEAM_DECL(cuDoubleComplex); + +ADD_DGMM_DECL(float); +ADD_DGMM_DECL(double); +ADD_DGMM_DECL(cuComplex); +ADD_DGMM_DECL(cuDoubleComplex); + +///@} +}// namespace cublas +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_API_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/ROCmCopy.hpp b/include/hydrogen/device/gpu/rocm/ROCmCopy.hpp new file mode 100644 index 0000000000..61cd72131b --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/ROCmCopy.hpp @@ -0,0 +1,132 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCMCOPY_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_ROCMCOPY_HPP_ + +#include "ROCmError.hpp" + +#include +#include + +#include +#include + +#include + +namespace hydrogen +{ +namespace gpu +{ + +/** @todo Flesh out documentation + * @todo these are actually only valid for "packed" types + */ + +// These functions are synchronous with respect to their SyncInfo +// objects (that is, they require explicit synchronization to the +// host). 
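One deliberate difference from the CUDA copies earlier in this patch: every ROCm routine below returns early when asked to move zero elements, so no stream work is enqueued for a no-op. If the CUDA file adopted the same guard, its Copy1DIntraDevice would read as follows (a sketch, not part of the patch):

    template <typename T>
    void Copy1DIntraDevice(T const* H_RESTRICT src, T* H_RESTRICT dest,
                           size_t num_elements,
                           SyncInfo<Device::GPU> const& si)
    {
        if (num_elements == 0UL)
            return; // skip enqueueing a zero-byte copy

        H_CHECK_CUDA(
            cudaMemcpyAsync(
                dest, src, num_elements*sizeof(T),
                cudaMemcpyDeviceToDevice, si.Stream()));
    }
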
+
+template
+void Fill1DBuffer(T* buffer, size_t num_elements, T value,
+                  SyncInfo const& si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    Fill_GPU_1D_impl(buffer, num_elements, value, si);
+}
+
+template
+void Copy1DIntraDevice(T const* H_RESTRICT src, T* H_RESTRICT dest,
+                       size_t num_elements,
+                       SyncInfo const& si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpyAsync(
+            dest, src, num_elements*sizeof(T),
+            hipMemcpyDeviceToDevice, si.Stream()));
+}
+
+template
+void Copy1DToHost(T const* H_RESTRICT src, T* H_RESTRICT dest,
+                  size_t num_elements,
+                  SyncInfo const& src_si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpyAsync(
+            dest, src, num_elements*sizeof(T),
+            hipMemcpyDeviceToHost, src_si.Stream()));
+}
+
+template
+void Copy1DToDevice(T const* H_RESTRICT src, T* H_RESTRICT dest,
+                    size_t num_elements,
+                    SyncInfo const& dest_si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpyAsync(
+            dest, src, num_elements*sizeof(T),
+            hipMemcpyHostToDevice, dest_si.Stream()));
+}
+
+template
+void Copy2DIntraDevice(T const* src, size_t src_ldim,
+                       T* dest, size_t dest_ldim,
+                       size_t height, size_t width,
+                       SyncInfo const& si)
+{
+    if (height == 0UL || width == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpy2DAsync(
+            dest, dest_ldim*sizeof(T),
+            src, src_ldim*sizeof(T),
+            height*sizeof(T), width,
+            hipMemcpyDeviceToDevice, si.Stream()));
+}
+
+template
+void Copy2DToHost(T const* src, size_t src_ldim,
+                  T* dest, size_t dest_ldim,
+                  size_t height, size_t width,
+                  SyncInfo const& src_si)
+{
+    if (height == 0UL || width == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpy2DAsync(
+            dest, dest_ldim*sizeof(T),
+            src, src_ldim*sizeof(T),
+            height*sizeof(T), width,
+            hipMemcpyDeviceToHost, src_si.Stream()));
+}
+
+template
+void Copy2DToDevice(T const* src, size_t src_ldim,
+                    T* dest, size_t dest_ldim,
+                    size_t height, size_t width,
+                    SyncInfo const& dest_si)
+{
+    if (height == 0UL || width == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpy2DAsync(
+            dest, dest_ldim*sizeof(T),
+            src, src_ldim*sizeof(T),
+            height*sizeof(T), width,
+            hipMemcpyHostToDevice, dest_si.Stream()));
+}
+
+}// namespace gpu
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCMCOPY_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/ROCmError.hpp b/include/hydrogen/device/gpu/rocm/ROCmError.hpp
new file mode 100644
index 0000000000..58c89849df
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/ROCmError.hpp
@@ -0,0 +1,51 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCMERROR_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCMERROR_HPP_
+
+#include
+
+#include
+
+#include
+
+#ifdef HYDROGEN_GPU_CALLS_ARE_SYNCHRONOUS
+#define H_SYNC_HIP() hipDeviceSynchronize()
+#else
+#define H_SYNC_HIP()
+#endif
+
+// Error handling macro
+#define H_CHECK_HIP(cmd)                                                \
+    do                                                                  \
+    {                                                                   \
+        H_SYNC_HIP();                                                   \
+        auto h_check_hip_error_code__ = cmd;                            \
+        H_ASSERT(h_check_hip_error_code__ == hipSuccess,                \
+                 ::hydrogen::HIPError,                                  \
+                 (hipDeviceReset(),                                     \
+                  ::hydrogen::rocm::BuildHipErrorMessage(               \
+                      #cmd, h_check_hip_error_code__)));                \
+        H_SYNC_HIP();                                                   \
+    } while (false)
+
+namespace hydrogen
+{
+
+/** @class HIPError
+ * @brief Exception class describing an error in the HIP environment.
+ */
+H_ADD_BASIC_EXCEPTION_CLASS(HIPError, GPUError);
+
+namespace rocm
+{
+
+/** @brief Write an error message describing the error detected in HIP.
+ * @param[in] cmd The expression that raised the error.
+ * @param[in] error_code The error code reported by HIP.
+ * @returns A string describing the error.
+ */ +std::string BuildHipErrorMessage( + std::string const& cmd, hipError_t error_code); + +}// namespace rocm +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCMERROR_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp b/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp new file mode 100644 index 0000000000..19c2cfcd70 --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp @@ -0,0 +1,30 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_ + +#include +#include + +namespace hydrogen +{ +namespace gpu +{ + +template +void LaunchKernel( + F kernel, dim3 const& gridDim, dim3 const& blkDim, + size_t sharedMem, SyncInfo const& si, + Args&&... kernel_args) +{ + H_CHECK_HIP(hipGetLastError()); + // Note that this is (currently) implemented as a macro; not clear + // if std::forward-ing the arguments is appropriate... + hipLaunchKernelGGL( + kernel, gridDim, blkDim, + sharedMem, si.Stream(), + std::forward(kernel_args)...); + H_CHECK_HIP(hipGetLastError()); +} + +}// namespace gpu +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp b/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp new file mode 100644 index 0000000000..58f7ab3d10 --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp @@ -0,0 +1,22 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_ + +#include + +namespace hydrogen +{ + +using gpuEvent_t = hipEvent_t; +using gpuStream_t = hipStream_t; + +namespace rocm +{ +hipEvent_t GetDefaultEvent() noexcept; +hipStream_t GetDefaultStream() noexcept; +hipEvent_t GetNewEvent(); +hipStream_t GetNewStream(); +void FreeEvent(hipEvent_t& event); +void FreeStream(hipStream_t& stream); +}// namespace rocm +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/SyncInfo.hpp b/include/hydrogen/device/gpu/rocm/SyncInfo.hpp new file mode 100644 index 0000000000..ee7d372b4c --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/SyncInfo.hpp @@ -0,0 +1,83 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_ + +#include + +#include +#include + +#include "ROCmError.hpp" +#include "ROCmManagement.hpp" + +namespace hydrogen +{ + +template <> +class SyncInfo +{ +public: + SyncInfo() + : SyncInfo{rocm::GetDefaultStream(), rocm::GetDefaultEvent()} + {} + + SyncInfo(hipStream_t stream, hipEvent_t event) + : stream_{stream}, event_{event} + {} + + void Merge(SyncInfo const& si) noexcept + { + if (si.stream_) + stream_ = si.stream_; + if (si.event_) + event_ = si.event_; + } + + hipStream_t Stream() const noexcept { return stream_; } + hipEvent_t Event() const noexcept { return event_; } +private: + friend void DestroySyncInfo(SyncInfo&); + hipStream_t stream_; + hipEvent_t event_; +};// struct SyncInfo + +inline void AddSynchronizationPoint(SyncInfo const& syncInfo) +{ + H_CHECK_HIP(hipEventRecord(syncInfo.Event(), syncInfo.Stream())); +} + +namespace details +{ +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ +} + +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ + // The CPU must wait for the GPU to catch up. + Synchronize(master); // wait for "master" +} + +// This captures the work done on A and forces "others" to wait for +// completion. 
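(The overload below mirrors the CUDA version: it records nothing itself, but makes `other` wait on the event last recorded for `master`.) Separately, the gpu::LaunchKernel wrapper introduced above deserves a usage sketch; the kernel and helper names here are illustrative, not part of the patch:

    // A trivial HIP kernel, launched through the wrapper above.
    __global__ void scale_kernel(float* x, float alpha, size_t n)
    {
        size_t const i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n)
            x[i] *= alpha;
    }

    void ScaleOnDevice(float* d_x, float alpha, size_t n,
                       hydrogen::SyncInfo<hydrogen::Device::GPU> const& si)
    {
        dim3 const blk(256);
        dim3 const grid(static_cast<unsigned>((n + 255)/256));
        hydrogen::gpu::LaunchKernel(scale_kernel, grid, blk,
                                    /*sharedMem=*/0, si,
                                    d_x, alpha, n);
    }
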
+template +inline +void AddSyncPoint( + SyncInfo const& master, SyncInfo const& other) +{ + if (master.Stream() != other.Stream()) + H_CHECK_HIP( + hipStreamWaitEvent(other.Stream(), master.Event(), 0)); +} +}// namespace details + +inline void Synchronize(SyncInfo const& syncInfo) +{ + H_CHECK_HIP(hipStreamSynchronize(syncInfo.Stream())); +} + +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/rocBLAS.hpp b/include/hydrogen/device/gpu/rocm/rocBLAS.hpp new file mode 100644 index 0000000000..15a10dc547 --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/rocBLAS.hpp @@ -0,0 +1,12 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_HPP_ + +#include "rocBLASError.hpp" +#include "rocBLASManagement.hpp" +#include "rocBLASMeta.hpp" +#include "rocBLASUtil.hpp" + +// The API wrapper declarations +#include "rocBLAS_API.hpp" + +#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/rocBLASError.hpp b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp new file mode 100644 index 0000000000..d5958c716c --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp @@ -0,0 +1,48 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASERROR_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASERROR_HPP_ + +#include + +#include +#include + +#include + +// Helper error-checking macro. +#define H_CHECK_ROCBLAS(cmd) \ + do \ + { \ + H_SYNC_HIP(); \ + auto h_check_rocblas_err_code__ = cmd; \ + H_ASSERT(h_check_rocblas_err_code__ == rocblas_status_success, \ + rocBLASError, \ + (hipDeviceReset(), \ + rocblas::BuildrocBLASErrorMessage( \ + #cmd, \ + h_check_rocblas_err_code__))); \ + H_SYNC_HIP(); \ + } while (false) + +namespace hydrogen +{ + +/** @class rocBLASError + * @brief Exception representing errors detected by rocBLAS library. + */ +H_ADD_BASIC_EXCEPTION_CLASS(rocBLASError,GPUError); + +namespace rocblas +{ + +/** @brief Write an error message describing the error detected in rocBLAS. + * @param[in] cmd The expression that raised the error. + * @param[in] error_code The error code reported by rocBLAS. + * @returns A string describing the error. + */ +std::string BuildrocBLASErrorMessage( + std::string const& cmd, rocblas_status error_code); + +}// namespace rocblas + +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASERROR_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp new file mode 100644 index 0000000000..137c58dbca --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp @@ -0,0 +1,74 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMANAGEMENT_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMANAGEMENT_HPP_ + +#include "rocBLASError.hpp" + +#include +#include + +#include +#include + +namespace hydrogen +{ + +namespace rocblas +{ + +/** @name rocBLAS management functions. */ +///@{ + +/** @brief Initialize rocBLAS. + * + * Creates the default library instance for rocBLAS. + * + * \param[in] handle The handle to use for rocBLAS. If null, a new + * handle will be created. If not null, it is + * assumed that the handle has been created with a + * user-side call to rocblas_create_handle(). + */ +void Initialize(rocblas_handle handle=nullptr); + +/** @brief Finalize the rocBLAS library. + * + * Destroys the default library handle. + * + * \throws rocBLASError If the rocBLAS library detects any errors. 
+ */ +void Finalize(); + +/** @brief Replace the default rocBLAS library handle. + * + * This will destroy the current default rocBLAS library handle and + * assume control of the input handle. The rocBLAS library must be + * initialized in order to call this function. + * + * \param[in] handle The new library handle. Hydrogen will take + * ownership of the new handle and destroy it in + * Finalize(). + * + * \throws std::logic_error If the input handle is null or the + * library isn't initialized. + */ +void ReplaceLibraryHandle(rocblas_handle handle); + +/** @brief Get the rocBLAS library handle. */ +rocblas_handle GetLibraryHandle() noexcept; + +/** @class SyncManager + * @brief Manage stream synchronization within rocBLAS. + */ +class SyncManager +{ +public: + SyncManager(rocblas_handle handle, SyncInfo const& si); + ~SyncManager(); +private: + hipStream_t orig_stream_; +};// class SyncManager + +///@} + +}// namespace rocblas +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMANAGEMENT_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp new file mode 100644 index 0000000000..d130850d83 --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp @@ -0,0 +1,121 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMETA_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMETA_HPP_ + +#include + +#include +#include +#include + +#include + +namespace hydrogen +{ +namespace rocblas +{ + +/** @class NativeTypeT + * @brief Metafunction mapping type names to HIP/rocBLAS equivalents. + * + * The mapping should provide bitwise equivalence. + * + * @note This belongs at this level because rocBLAS defines types (or + * names of types) that are local to the BLAS + * implementation. Additionally, it's feasible to conceive of + * custom types on the GPU that would, likewise, need to be + * mapped to the types that rocBLAS knows about. + * + * @todo Add static assertions to ensure only valid types get mapped. + */ +template +struct NativeTypeT +{ + using type = T; +}; + +// Complex and Double-Complex types require conversion +template <> +struct NativeTypeT> { using type = rocblas_float_complex; }; +template <> +struct NativeTypeT> { using type = rocblas_double_complex; }; + +// Half precision requires conversion as well +#ifdef HYDROGEN_GPU_USE_FP16 +template <> struct NativeTypeT { using type = rocblas_half; }; +#ifdef HYDROGEN_HAVE_HALF +template <> struct NativeTypeT { using type = rocblas_half; }; +template <> +struct NativeTypeT> +{ + using type = rocblas_half_complex; +}; +#endif // HYDROGEN_HAVE_HALF +#endif // HYDROGEN_GPU_USE_FP16 + +/** @brief Convenience wrapper for NativeTypeT */ +template +using NativeType = typename NativeTypeT::type; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace meta_details +{ +template +auto Try_HasNativeType(int) -> SubstitutionSuccess>; +template +auto Try_HasNativeType(...) -> std::false_type; +}// namespace meta_details +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** @struct HasNativeType + * @brief Predicate that determines if a type is mappable to a + * library-native type. + */ +template +struct HasNativeType : decltype(meta_details::Try_HasNativeType(0)) {}; + +/** @class IsSupportedType_Base + * @brief Predicate indicating that a type is supported within rocBLAS + * for the given operation. + * + * This is used to map internal rocBLAS types to the operations that + * are supported. For example, `float` is always supported but + * `rocblas_half` only has support in a few functions. 
+ */
+template
+struct IsSupportedType_Base : std::false_type {};
+
+template
+struct IsSupportedType_Base : std::true_type {};
+template
+struct IsSupportedType_Base : std::true_type {};
+template <>
+struct IsSupportedType_Base : std::false_type {};
+template <>
+struct IsSupportedType_Base : std::false_type {};
+
+#ifdef HYDROGEN_GPU_USE_FP16
+template <>
+struct IsSupportedType_Base : std::true_type {};
+template <>
+struct IsSupportedType_Base : std::true_type {};
+#endif // HYDROGEN_GPU_USE_FP16
+
+/** @class IsSupportedType
+ * @brief Predicate indicating that the given type is compatible with
+ *        rocBLAS.
+ *
+ * This is true when either the type is a compatible rocBLAS type
+ * (e.g., float) or when it is bitwise equivalent to one (e.g.,
+ * std::complex).
+ */
+template ::value>
+struct IsSupportedType
+    : IsSupportedType_Base, op>
+{};
+
+template
+struct IsSupportedType : std::false_type {};
+
+}// namespace rocblas
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMETA_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp
new file mode 100644
index 0000000000..ee47d51aec
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp
@@ -0,0 +1,71 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASUTIL_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASUTIL_HPP_
+
+#include
+
+#include
+
+namespace hydrogen
+{
+namespace rocblas
+{
+
+/** @brief rocBLAS uses its own int typedef to represent sizes. */
+using SizeT = rocblas_int;
+
+/** @brief Convert a value to the size type expected by the rocBLAS
+ *         library.
+ *
+ * If `HYDROGEN_DO_BOUNDS_CHECKING` is defined, this will do a
+ * "safe cast" (it will verify that `val` is in the dynamic range of
+ * `int`). Otherwise it will do a regular static_cast.
+ */
+template
+#ifdef HYDROGEN_DO_BOUNDS_CHECKING
+SizeT ToSizeT(T const& val)
+{
+    return narrow_cast(val);
+}
+#else
+SizeT ToSizeT(T const& val) noexcept
+{
+    return static_cast(val);
+}
+#endif // HYDROGEN_DO_BOUNDS_CHECKING
+
+/** @brief Overload to prevent extra work in the case of dynamic range
+ *         checking.
+ */
+inline SizeT ToSizeT(SizeT const& val) noexcept
+{
+    return val;
+}
+
+/** @brief Convert a TransposeMode to the rocBLAS operation type. */
+inline rocblas_operation
+ToNativeTransposeMode(TransposeMode const& orient) noexcept
+{
+    switch (orient)
+    {
+    case TransposeMode::TRANSPOSE:
+        return rocblas_operation_transpose;
+    case TransposeMode::CONJ_TRANSPOSE:
+        return rocblas_operation_conjugate_transpose;
+    default: // TransposeMode::NORMAL
+        return rocblas_operation_none;
+    }
+}
+
+/** @brief Convert a SideMode to the rocBLAS side mode type.
*/
+inline rocblas_side
+ToNativeSideMode(SideMode const& side) noexcept
+{
+    if (side == SideMode::LEFT)
+        return rocblas_side_left;
+
+    return rocblas_side_right;
+}
+
+}// namespace rocblas
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASUTIL_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp
new file mode 100644
index 0000000000..0ffe469ba4
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp
@@ -0,0 +1,131 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_API_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_API_HPP_
+
+#include
+
+#include
+
+namespace hydrogen
+{
+namespace rocblas
+{
+
+/** @name BLAS-1 Routines */
+///@{
+
+#define ADD_AXPY_DECL(ScalarType)                       \
+    void Axpy(rocblas_handle handle,                    \
+              int n, ScalarType const& alpha,           \
+              ScalarType const* X, int incx,            \
+              ScalarType* Y, int incy)
+
+#define ADD_COPY_DECL(ScalarType)                       \
+    void Copy(rocblas_handle handle,                    \
+              int n, ScalarType const* X, int incx,     \
+              ScalarType* Y, int incy)
+
+#define ADD_DOT_DECL(ScalarType)                        \
+    void Dot(rocblas_handle handle,                     \
+             int n,                                     \
+             ScalarType const* X, int incx,             \
+             ScalarType const* Y, int incy,             \
+             ScalarType* output)
+
+#define ADD_NRM2_DECL(ScalarType)                       \
+    void Nrm2(rocblas_handle handle,                    \
+              int n,                                    \
+              ScalarType const* X, int incx,            \
+              ScalarType* output)
+
+#define ADD_SCALE_DECL(ScalarType)                      \
+    void Scale(rocblas_handle handle,                   \
+               int n, ScalarType const& alpha,          \
+               ScalarType* X, int incx)
+
+#ifdef HYDROGEN_GPU_USE_FP16
+ADD_AXPY_DECL(rocblas_half);
+#endif // HYDROGEN_GPU_USE_FP16
+ADD_AXPY_DECL(float);
+ADD_AXPY_DECL(double);
+
+ADD_COPY_DECL(float);
+ADD_COPY_DECL(double);
+
+#ifdef HYDROGEN_GPU_USE_FP16
+ADD_SCALE_DECL(rocblas_half);
+#endif // HYDROGEN_GPU_USE_FP16
+ADD_SCALE_DECL(float);
+ADD_SCALE_DECL(double);
+
+///@}
+/** @name BLAS-2 Routines */
+///@{
+
+#define ADD_GEMV_DECL(ScalarType)                       \
+    void Gemv(                                          \
+        rocblas_handle handle,                          \
+        rocblas_operation transpA, int m, int n,        \
+        ScalarType const& alpha,                        \
+        ScalarType const* A, int lda,                   \
+        ScalarType const* x, int incx,                  \
+        ScalarType const& beta,                         \
+        ScalarType* y, int incy)
+
+ADD_GEMV_DECL(float);
+ADD_GEMV_DECL(double);
+
+///@}
+/** @name BLAS-3 Routines */
+///@{
+
+#define ADD_GEMM_DECL(ScalarType)                       \
+    void Gemm(                                          \
+        rocblas_handle handle,                          \
+        rocblas_operation transpA,                      \
+        rocblas_operation transpB,                      \
+        int m, int n, int k,                            \
+        ScalarType const& alpha,                        \
+        ScalarType const* A, int lda,                   \
+        ScalarType const* B, int ldb,                   \
+        ScalarType const& beta,                         \
+        ScalarType* C, int ldc)
+
+#ifdef HYDROGEN_GPU_USE_FP16
+ADD_GEMM_DECL(rocblas_half);
+#endif // HYDROGEN_GPU_USE_FP16
+ADD_GEMM_DECL(float);
+ADD_GEMM_DECL(double);
+
+///@}
+/** @name BLAS-like Extension Routines */
+///@{
+
+// We use this for Axpy2D, Copy2D, and Transpose
+#define ADD_GEAM_DECL(ScalarType)                       \
+    void Geam(rocblas_handle handle,                    \
+              rocblas_operation transpA,                \
+              rocblas_operation transpB,                \
+              int m, int n,                             \
+              ScalarType const& alpha,                  \
+              ScalarType const* A, int lda,             \
+              ScalarType const& beta,                   \
+              ScalarType const* B, int ldb,             \
+              ScalarType* C, int ldc)
+
+ADD_GEAM_DECL(float);
+ADD_GEAM_DECL(double);
+
+#define ADD_DGMM_DECL(ScalarType)                       \
+    void Dgmm(rocblas_handle handle,                    \
+              rocblas_side side,                        \
+              int m, int n,                             \
+              ScalarType const* A, int lda,             \
+              ScalarType const* X, int incx,            \
+              ScalarType* C, int ldc)
+
+ADD_DGMM_DECL(float);
+ADD_DGMM_DECL(double);
+
+///@}
+}// namespace rocblas
+}// namespace hydrogen
+#endif //
diff --git a/include/hydrogen/meta/MetaUtilities.hpp b/include/hydrogen/meta/MetaUtilities.hpp
index 89a8de91c9..86d3d3402d 100644
--- a/include/hydrogen/meta/MetaUtilities.hpp
+++ b/include/hydrogen/meta/MetaUtilities.hpp
@@ -112,6 +112,10 @@
 using MakePointer = typename std::add_pointer<T>::type;
 template <typename T>
 using MakePointerToConst = MakePointer<MakeConst<T>>;
 
+/** @brief Convenience type predicate to check if two types are the same. */
+template <typename T, typename U>
+using IsSame = std::is_same<T, U>;
+
 // Wrapper around std::conditional
 template <bool B, typename T, typename U>
 using Select = typename std::conditional<B, T, U>::type;
@@ -127,6 +131,5 @@
 template <typename T, T A>
 struct EnumSame<T, A, A> : std::true_type {};
 ///@}
-
 }// namespace hydrogen
 #endif // HYDROGEN_META_METAUTILITIES_HPP_
diff --git a/include/hydrogen/utils/SimpleBuffer.hpp b/include/hydrogen/utils/SimpleBuffer.hpp
index 697476bb07..c210094b42 100644
--- a/include/hydrogen/utils/SimpleBuffer.hpp
+++ b/include/hydrogen/utils/SimpleBuffer.hpp
@@ -5,9 +5,10 @@
 #include
 #include
-#ifdef HYDROGEN_HAVE_CUDA
-#include
-#endif // HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
+#include
+#include
+#endif // HYDROGEN_HAVE_GPU
 
 #include
@@ -66,28 +67,15 @@ void setBufferToValue(T* buffer, size_t size, T const& value,
     std::fill_n(buffer, size, value);
 }
 
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 void setBufferToValue(T* buffer, size_t size, T const& value,
-                      SyncInfo<Device::GPU> syncInfo = SyncInfo<Device::GPU>{})
+                      SyncInfo<Device::GPU> const& syncInfo)
 {
-    if (value == TypeTraits<T>::Zero())
-    {
-        H_CHECK_CUDA(cudaMemsetAsync(buffer, 0x0, size*sizeof(T),
-                                     syncInfo.stream_));
-    }
-    else
-    {
-        std::vector<T> tmp(size, value);
-        H_CHECK_CUDA(
-            cudaMemcpyAsync(
-                buffer, tmp.data(), size*sizeof(T),
-                CUDAMemcpyKind<Device::CPU, Device::GPU>(),
-                syncInfo.stream_));
-    }
+    gpu::Fill1DBuffer(buffer, size, value, syncInfo);
     AddSynchronizationPoint(syncInfo);
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
 
 }// namespace details
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7412589367..03cadd3e4f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,8 +14,8 @@ add_subdirectory(hydrogen)
 # Propagate the files up the tree
 set(HYDROGEN_SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE)
-if (HYDROGEN_HAVE_CUDA)
-  set(HYDROGEN_CUDA_SOURCES "${CUDA_SOURCES}" PARENT_SCOPE)
+if (HYDROGEN_HAVE_GPU)
+  set(HYDROGEN_GPU_SOURCES "${GPU_SOURCES}" PARENT_SCOPE)
 endif ()
 set(HYDROGEN_CATCH2_TEST_FILES "${CATCH2_TESTS}" "${THIS_DIR_CATCH2_TESTS}" PARENT_SCOPE)
diff --git a/src/blas_like/level2/Gemv.cpp b/src/blas_like/level2/Gemv.cpp
index 5556885af2..47c37bd0db 100644
--- a/src/blas_like/level2/Gemv.cpp
+++ b/src/blas_like/level2/Gemv.cpp
@@ -33,7 +33,7 @@ void Gemv(Orientation orientA,
                  beta, static_cast<Matrix<T, Device::CPU>&>(C));
             break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
         case Device::GPU:
             Gemv(orientA,
                  alpha, static_cast<Matrix<T, Device::GPU> const&>(A),
@@ -41,7 +41,7 @@ void Gemv(Orientation orientA,
                  beta, static_cast<Matrix<T, Device::GPU>&>(C));
             break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
         default:
             LogicError("Bad device type.");
     }
@@ -309,7 +309,7 @@ void Gemv
     Gemv(orientation, alpha, A, x, T(0), y);
 }
 
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template void Gemv(Orientation orientA,
                    float alpha, Matrix<float, Device::GPU> const& A,
@@ -335,7 +335,7 @@ template void Gemv(Orientation, gpu_half_type,
                    gpu_half_type, AbstractMatrix<gpu_half_type>&);
 #endif // HYDROGEN_GPU_USE_FP16
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
 
 #define PROTO(T) \
     template void Gemv \
diff --git a/src/blas_like/level2/Gemv/Normal.hpp b/src/blas_like/level2/Gemv/Normal.hpp
index
c111a7d67e..8d315e3623 100644 --- a/src/blas_like/level2/Gemv/Normal.hpp +++ b/src/blas_like/level2/Gemv/Normal.hpp @@ -131,11 +131,11 @@ void Normal case Device::CPU: Normal_impl(alpha, APre, x, beta, yPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Normal_impl(alpha, APre, x, beta, yPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Gemv::Normal: Bad device."); } diff --git a/src/blas_like/level2/Gemv/Transpose.hpp b/src/blas_like/level2/Gemv/Transpose.hpp index cf67564de4..4d365428f2 100644 --- a/src/blas_like/level2/Gemv/Transpose.hpp +++ b/src/blas_like/level2/Gemv/Transpose.hpp @@ -148,11 +148,11 @@ void Transpose case Device::CPU: Transpose_impl(orientation, alpha, APre, x, beta, yPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Transpose_impl(orientation, alpha, APre, x, beta, yPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Gemv::Transpose: Bad device."); } diff --git a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp index 3198cd1b15..16d338f2e9 100644 --- a/src/blas_like/level3/Gemm.cpp +++ b/src/blas_like/level3/Gemm.cpp @@ -72,7 +72,7 @@ InitializeComms(El::Grid const& g, InitGrid(g, syncInfo); InitGrid(g, syncInfo); } - H_CHECK_CUDA(cudaDeviceSynchronize()); + hydrogen::gpu::SynchronizeDevice(); initialized_grids_.push_front(&g); } return pool; @@ -114,7 +114,7 @@ void Gemm(Orientation orientA, Orientation orientB, beta, static_cast&>(C)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Gemm(orientA, orientB, alpha, static_cast const&>(A), @@ -122,7 +122,7 @@ void Gemm(Orientation orientA, Orientation orientB, beta, static_cast&>(C)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device type."); } @@ -159,7 +159,7 @@ void Gemm_impl( beta, C.Buffer(), C.LDim()); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void Gemm_impl( Orientation orientA, Orientation orientB, @@ -185,7 +185,7 @@ void Gemm_impl( beta, C.Buffer(), C.LDim(), master_sync); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU }// namespace @@ -439,7 +439,7 @@ void LocalGemm LocalGemm(orientA, orientB, alpha, A, B, TypeTraits::Zero(), C); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void Gemm(Orientation orientA, Orientation orientB, float alpha, Matrix const& A, @@ -460,7 +460,7 @@ template void Gemm(Orientation orientA, Orientation orientB, gpu_half_type beta, Matrix& C); #endif // HYDROGEN_GPU_USE_FP16 -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define ABSTRACT_PROTO(T) \ template void Gemm( \ diff --git a/src/blas_like/level3/Gemm/TN.hpp b/src/blas_like/level3/Gemm/TN.hpp index 4d2525f6a4..50486f448c 100644 --- a/src/blas_like/level3/Gemm/TN.hpp +++ b/src/blas_like/level3/Gemm/TN.hpp @@ -87,11 +87,11 @@ void SUMMA_TNA case Device::CPU: SUMMA_TNA_impl(orientA, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNA_impl(orientA, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } @@ -202,11 +202,11 @@ void SUMMA_TNB case Device::CPU: SUMMA_TNB_impl(orientA, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNB_impl(orientA, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // 
HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } @@ -317,11 +317,11 @@ void SUMMA_TNC case Device::CPU: SUMMA_TNC_impl(orientA, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNC_impl(orientA, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } @@ -444,12 +444,12 @@ void SUMMA_TNDot SUMMA_TNDot_impl( orientA, alpha, APre, BPre, CPre, blockSize); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNDot_impl( orientA, alpha, APre, BPre, CPre, blockSize); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } diff --git a/src/blas_like/level3/Gemm/TT.hpp b/src/blas_like/level3/Gemm/TT.hpp index b2078ce03d..1955b0111a 100644 --- a/src/blas_like/level3/Gemm/TT.hpp +++ b/src/blas_like/level3/Gemm/TT.hpp @@ -89,12 +89,12 @@ void SUMMA_TTA SUMMA_TTA_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTA_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTA: Bad device."); } @@ -179,12 +179,12 @@ void SUMMA_TTB SUMMA_TTB_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTB_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTB: Bad device."); } @@ -267,12 +267,12 @@ void SUMMA_TTC SUMMA_TTC_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTC_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTC: Bad device."); } @@ -364,12 +364,12 @@ void SUMMA_TTDot SUMMA_TTDot_impl( orientA, orientB, alpha, APre, BPre, CPre, blockSize); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTDot_impl( orientA, orientB, alpha, APre, BPre, CPre, blockSize); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTA: Bad device."); } diff --git a/src/blas_like/level3/SyncInfoPool.hpp b/src/blas_like/level3/SyncInfoPool.hpp index 30b9226db7..f54cedfb96 100644 --- a/src/blas_like/level3/SyncInfoPool.hpp +++ b/src/blas_like/level3/SyncInfoPool.hpp @@ -145,17 +145,21 @@ void swap(SyncInfoPool& a, SyncInfoPool& b) noexcept template <> SyncInfoPool::~SyncInfoPool() { +#ifdef HYDROGEN_HAVE_CUDA + using GPUErrorType = CUDAError; +#elif defined(HYDROGEN_HAVE_ROCM) + using GPUErrorType = HIPError; +#endif try { for (auto& si : pool_) { - H_CHECK_CUDA(cudaEventDestroy(si.event_)); - H_CHECK_CUDA(cudaStreamDestroy(si.stream_)); + DestroySyncInfo(si); } } - catch (CudaError const& e) + catch (GPUErrorType const& e) { - std::cerr << "Warning: CUDA error detected:\n\ne.what(): " + std::cerr << "Warning: GPU runtime error detected:\n\ne.what(): " << e.what() << std::endl; } @@ -176,21 +180,14 @@ void SyncInfoPool::EnsureSize(size_t pool_size) size_t const new_elements = pool_size - start_size; for (auto ii = 0UL; ii < new_elements; ++ii) { - cudaStream_t stream; - cudaEvent_t event; - H_CHECK_CUDA( - cudaStreamCreateWithFlags( - &stream, cudaStreamNonBlocking)); - H_CHECK_CUDA( - 
cudaEventCreateWithFlags( - &event, cudaEventDisableTiming)); + auto si = CreateNewSyncInfo(); #ifdef HYDROGEN_HAVE_NVPROF // Name the stream for debugging purposes std::string const stream_name = "H: SP (" + std::to_string(start_size + ii) + ")"; - nvtxNameCudaStreamA(stream, stream_name.c_str()); + nvtxNameCudaStreamA(si.Stream(), stream_name.c_str()); #endif // HYDROGEN_HAVE_NVPROF - pool_.emplace_back(stream, event); + pool_.emplace_back(std::move(si)); } // Handle iterators: diff --git a/src/blas_like/level3/sync_info_pool_test.cpp b/src/blas_like/level3/sync_info_pool_test.cpp index 44f56e2221..48c69ff210 100644 --- a/src/blas_like/level3/sync_info_pool_test.cpp +++ b/src/blas_like/level3/sync_info_pool_test.cpp @@ -65,8 +65,8 @@ TEST_CASE( for (auto const& si : tmp) { auto const& pool_si = pool.Next(); - CHECK(si.stream_ == pool_si.stream_); - CHECK(si.event_ == pool_si.event_); + CHECK(si.Stream() == pool_si.Stream()); + CHECK(si.Event() == pool_si.Event()); } SECTION("Moving the pool preserves iterators") @@ -79,8 +79,8 @@ TEST_CASE( auto const& pool_si = pool_mv.Next(); auto const& si = tmp[1]; - CHECK(si.stream_ == pool_si.stream_); - CHECK(si.event_ == pool_si.event_); + CHECK(si.Stream() == pool_si.Stream()); + CHECK(si.Event() == pool_si.Event()); } SECTION("Growing the pool preserves iterators") @@ -95,8 +95,8 @@ TEST_CASE( auto const& pool_si = pool.Next(); auto const& si = tmp[1]; - CHECK(si.stream_ == pool_si.stream_); - CHECK(si.event_ == pool_si.event_); + CHECK(si.Stream() == pool_si.Stream()); + CHECK(si.Event() == pool_si.Event()); } } SECTION("Resetting the pool returns to the same point") @@ -104,8 +104,8 @@ TEST_CASE( auto const& first = pool.Next(); pool.Reset(); auto const& after_reset = pool.Next(); - CHECK(first.event_ == after_reset.event_); - CHECK(first.stream_ == after_reset.stream_); + CHECK(first.Event() == after_reset.Event()); + CHECK(first.Stream() == after_reset.Stream()); } } diff --git a/src/core/DistMatrix/AbstractDistMatrix.cpp b/src/core/DistMatrix/AbstractDistMatrix.cpp index 91e3a2ca3c..d1f492a687 100644 --- a/src/core/DistMatrix/AbstractDistMatrix.cpp +++ b/src/core/DistMatrix/AbstractDistMatrix.cpp @@ -124,7 +124,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgSize, Root(), CrossComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -133,7 +133,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) mpi::Broadcast(message, msgSize, Root(), CrossComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("AbstractMatrix: Bad Device!"); } @@ -143,7 +143,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgSize, vcRoot, grid.ViewingComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -152,7 +152,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) mpi::Broadcast(message, msgSize, vcRoot, grid.ViewingComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("AbstractMatrix: Bad Device!"); diff --git a/src/core/DistMatrix/ElementMatrix.cpp b/src/core/DistMatrix/ElementMatrix.cpp index 8523ce825d..eb6e0f421d 100644 --- a/src/core/DistMatrix/ElementMatrix.cpp +++ 
b/src/core/DistMatrix/ElementMatrix.cpp @@ -114,7 +114,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgLength, this->Root(), this->CrossComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -123,7 +123,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) mpi::Broadcast(message, msgLength, this->Root(), this->CrossComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("ElementalMatrix: Bad Device!"); } @@ -133,7 +133,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgLength, vcRoot, grid.ViewingComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -142,7 +142,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) mpi::Broadcast(message, msgLength, vcRoot, grid.ViewingComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("ElementalMatrix: Bad Device!"); } @@ -395,7 +395,7 @@ ElementalMatrix::Attach( static_cast&>(this->Matrix()). Attach_(localHeight, localWidth, buffer, ldim); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case El::Device::GPU: static_cast&>(this->Matrix()). Attach_(localHeight, localWidth, buffer, ldim); @@ -459,7 +459,7 @@ ElementalMatrix::LockedAttach( static_cast&>(this->Matrix()). LockedAttach_(localHeight, localWidth, buffer, ldim); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case El::Device::GPU: static_cast&>(this->Matrix()). LockedAttach_(localHeight, localWidth, buffer, ldim); diff --git a/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp b/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp index dbbd10aa61..2e036a11d5 100644 --- a/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp +++ b/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp @@ -144,7 +144,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template @@ -247,7 +247,7 @@ template DistMatrix::DistMatrix( template DistMatrix& DistMatrix::operator=( const DistMatrix&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MC_MR.cpp b/src/core/DistMatrix/ElementMatrix/MC_MR.cpp index 6e1b8cb7b2..0b737081fa 100644 --- a/src/core/DistMatrix/ElementMatrix/MC_MR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MC_MR.cpp @@ -278,7 +278,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template DistMatrix::DistMatrix( @@ -375,7 +375,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp b/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp index 51568a0be8..332cd7bb12 100644 --- a/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp @@ -301,7 +301,7 @@ int 
DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -389,7 +389,7 @@ template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp b/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp index 1d0b923125..a1acaf2b4e 100644 --- a/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp @@ -271,7 +271,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -352,7 +352,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MR_MC.cpp b/src/core/DistMatrix/ElementMatrix/MR_MC.cpp index e21d04b5de..8b0e4a0435 100644 --- a/src/core/DistMatrix/ElementMatrix/MR_MC.cpp +++ b/src/core/DistMatrix/ElementMatrix/MR_MC.cpp @@ -272,7 +272,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors template DistMatrix::DistMatrix( const DistMatrix&); @@ -365,7 +365,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp b/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp index 8076016559..7299173c83 100644 --- a/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp @@ -300,7 +300,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template DistMatrix::DistMatrix( @@ -397,7 +397,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp b/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp index 41d21af8ac..d4a2d2b3bb 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp @@ -295,7 +295,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -376,7 +376,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp b/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp index e3e634dcd8..de90940033 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp +++ 
b/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp @@ -271,7 +271,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -356,7 +356,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp b/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp index fa739c9d83..59af515b4e 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp @@ -306,7 +306,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -391,7 +391,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp b/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp index 2a0bbe62c4..5ffbf20245 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp @@ -256,7 +256,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template DistMatrix::DistMatrix( @@ -353,7 +353,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp b/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp index a3619202d4..217314fcac 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp @@ -281,7 +281,7 @@ int DM::PartialUnionColRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -362,7 +362,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp b/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp index 1b9e8afab7..a72b2943ce 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp @@ -272,7 +272,7 @@ int DM::PartialUnionColRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -353,7 +353,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp b/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp index 
c336aa90f7..ae34eec855 100644 --- a/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp @@ -281,7 +281,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT OTHER(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -362,7 +362,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp b/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp index 721b88e80e..09b9a6cd16 100644 --- a/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp @@ -277,7 +277,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ OTHER(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -358,7 +358,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/setup.hpp b/src/core/DistMatrix/ElementMatrix/setup.hpp index 65723b2f8f..eb7d7467c0 100644 --- a/src/core/DistMatrix/ElementMatrix/setup.hpp +++ b/src/core/DistMatrix/ElementMatrix/setup.hpp @@ -234,10 +234,10 @@ DM::ConstructWithNewDevice(Device D2) const { case Device::CPU: return ConstructWithNewDevice_impl_(); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: return ConstructWithNewDevice_impl_(); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unkown device type."); } diff --git a/src/core/MemoryPool.cpp b/src/core/MemoryPool.cpp index b7af3b4e4b..1b5058f651 100644 --- a/src/core/MemoryPool.cpp +++ b/src/core/MemoryPool.cpp @@ -7,13 +7,13 @@ namespace El namespace { -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU std::unique_ptr> pinnedHostMemoryPool_; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU std::unique_ptr> hostMemoryPool_; } // namespace -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU MemoryPool& PinnedHostMemoryPool() { @@ -25,7 +25,7 @@ MemoryPool& PinnedHostMemoryPool() void DestroyPinnedHostMemoryPool() { pinnedHostMemoryPool_.reset(); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MemoryPool& HostMemoryPool() { diff --git a/src/core/environment.cpp b/src/core/environment.cpp index 46a4b9b820..fc0db9f499 100644 --- a/src/core/environment.cpp +++ b/src/core/environment.cpp @@ -9,6 +9,12 @@ */ #include +#include + +#ifdef HYDROGEN_HAVE_GPU +#include +#endif // HYDROGEN_HAVE_GPU + #include #include @@ -216,9 +222,9 @@ void Initialize( int& argc, char**& argv ) ::args = new Args( argc, argv, mpi::COMM_WORLD, std::cerr ); -#ifdef HYDROGEN_HAVE_CUDA - InitializeCUDA(argc, argv); -#endif // HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU + gpu::Initialize(); +#endif // HYDROGEN_HAVE_GPU ::numElemInits = 1; if( !mpi::Initialized() ) @@ -296,6 +302,9 @@ void Initialize( int& argc, char**& argv ) #ifdef HYDROGEN_HAVE_CUDA cublas::Initialize(); #endif +#ifdef HYDROGEN_HAVE_ROCM + hydrogen::rocblas::Initialize(); +#endif #ifdef EL_HAVE_QT5 InitializeQt5( argc, argv ); @@ -358,8 +367,8 @@ void Finalize() 
FinalizeRandom(); } -#ifdef HYDROGEN_HAVE_CUDA - FinalizeCUDA(); +#ifdef HYDROGEN_HAVE_GPU + gpu::Finalize(); #endif EL_DEBUG_ONLY( CloseLog() ) diff --git a/src/core/imports/CMakeLists.txt b/src/core/imports/CMakeLists.txt index 021e2093e0..1a688f563e 100644 --- a/src/core/imports/CMakeLists.txt +++ b/src/core/imports/CMakeLists.txt @@ -12,10 +12,6 @@ set_full_path(THIS_DIR_SOURCES scalapack.cpp ) -if (HYDROGEN_HAVE_CUDA) - set_full_path(CUDA_SOURCES cuda.cpp cublas.cpp) - list(APPEND THIS_DIR_SOURCES ${CUDA_SOURCES}) -endif() if (HYDROGEN_HAVE_CUB) set_full_path(CUB_SOURCES cub.cpp) list(APPEND THIS_DIR_SOURCES ${CUB_SOURCES}) diff --git a/src/core/imports/cub.cpp b/src/core/imports/cub.cpp index 16a25730d1..61fb2ca6bc 100644 --- a/src/core/imports/cub.cpp +++ b/src/core/imports/cub.cpp @@ -1,7 +1,11 @@ -#include +#include "hydrogen/device/gpu/CUB.hpp" #include +namespace hydrogen +{ +namespace cub +{ namespace { @@ -27,7 +31,7 @@ unsigned int get_min_bin() noexcept unsigned int get_max_bin() noexcept { return get_env_uint("H_CUB_MAX_BIN", - ::cub::CachingDeviceAllocator::INVALID_BIN); + cub_impl::CachingDeviceAllocator::INVALID_BIN); } size_t get_max_cached_size() noexcept @@ -35,7 +39,7 @@ size_t get_max_cached_size() noexcept char const* env = std::getenv("H_CUB_MAX_CACHED_SIZE"); return (env ? static_cast(std::stoul(env)) - : ::cub::CachingDeviceAllocator::INVALID_SIZE); + : cub_impl::CachingDeviceAllocator::INVALID_SIZE); } bool get_debug() noexcept @@ -47,19 +51,14 @@ bool get_debug() noexcept } /** Singleton instance of CUB memory pool. */ -std::unique_ptr<::cub::CachingDeviceAllocator> memoryPool_; +std::unique_ptr memoryPool_; } // namespace -namespace hydrogen -{ -namespace cub -{ - -::cub::CachingDeviceAllocator& MemoryPool() +cub_impl::CachingDeviceAllocator& MemoryPool() { if (!memoryPool_) memoryPool_.reset( - new ::cub::CachingDeviceAllocator( + new cub_impl::CachingDeviceAllocator( get_bin_growth(), get_min_bin(), get_max_bin(), diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 661c7a0dad..9d724f516e 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -2484,7 +2484,7 @@ EL_NO_RELEASE_EXCEPT EL_NO_RELEASE_EXCEPT; \ MPI_PROTO_COMMON_DEV(Complex,D) -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define MPI_PROTO(T) \ MPI_PROTO_DEVICELESS(T) \ MPI_PROTO_DEV(T, Device::CPU) \ @@ -2506,7 +2506,7 @@ MPI_PROTO(Entry) #define MPI_PROTO_COMPLEX(T) \ MPI_PROTO_DEVICELESS_COMPLEX(T) \ MPI_PROTO_COMPLEX_DEV(T, Device::CPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_PROTO(byte) MPI_PROTO(int) diff --git a/src/core/imports/mpi/AllGather.hpp b/src/core/imports/mpi/AllGather.hpp index bfff12bb6a..c7c7d3d2e4 100644 --- a/src/core/imports/mpi/AllGather.hpp +++ b/src/core/imports/mpi/AllGather.hpp @@ -132,14 +132,14 @@ void AllGather( template void AllGather(const T* sbuf, int sc, T* rbuf, int rc, Comm const& comm, \ SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_ALLGATHER_PROTO(T) \ MPI_ALLGATHER_PROTO_DEV(T,Device::CPU) #else #define MPI_ALLGATHER_PROTO(T) \ MPI_ALLGATHER_PROTO_DEV(T,Device::CPU); \ MPI_ALLGATHER_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_ALLGATHER_PROTO(byte); MPI_ALLGATHER_PROTO(int); diff --git a/src/core/imports/mpi/AllReduce.hpp b/src/core/imports/mpi/AllReduce.hpp index 2ad7b151db..f916d97518 100644 --- a/src/core/imports/mpi/AllReduce.hpp +++ b/src/core/imports/mpi/AllReduce.hpp @@ -254,14 +254,14 @@ void AllReduce(T* 
buf, int count, Comm const& comm, SyncInfo const& syncInfo) template T AllReduce(T, Comm const&, SyncInfo const&); \ template void AllReduce(T*, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_ALLREDUCE_PROTO(T) \ MPI_ALLREDUCE_PROTO_DEV(T,Device::CPU) #else #define MPI_ALLREDUCE_PROTO(T) \ MPI_ALLREDUCE_PROTO_DEV(T,Device::CPU); \ MPI_ALLREDUCE_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_ALLREDUCE_PROTO(byte); MPI_ALLREDUCE_PROTO(int); diff --git a/src/core/imports/mpi/AllToAll.hpp b/src/core/imports/mpi/AllToAll.hpp index 7b2d08a6a0..a0cff86729 100644 --- a/src/core/imports/mpi/AllToAll.hpp +++ b/src/core/imports/mpi/AllToAll.hpp @@ -110,14 +110,14 @@ void AllToAll(T const*, int, T*, int, Comm const&, SyncInfo const&) #define MPI_ALLTOALL_PROTO_DEV(T,D) \ template void AllToAll(T const*, int, T*, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_ALLTOALL_PROTO(T) \ MPI_ALLTOALL_PROTO_DEV(T,Device::CPU) #else #define MPI_ALLTOALL_PROTO(T) \ MPI_ALLTOALL_PROTO_DEV(T,Device::CPU); \ MPI_ALLTOALL_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_ALLTOALL_PROTO(byte); MPI_ALLTOALL_PROTO(int); diff --git a/src/core/imports/mpi/Broadcast.hpp b/src/core/imports/mpi/Broadcast.hpp index 92b5747381..04a3fb859b 100644 --- a/src/core/imports/mpi/Broadcast.hpp +++ b/src/core/imports/mpi/Broadcast.hpp @@ -111,14 +111,14 @@ void Broadcast( T& b, int root, Comm const& comm, SyncInfo const& syncInfo ) template void Broadcast(T*, int, int, Comm const&, SyncInfo const&); \ template void Broadcast(T&, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_BROADCAST_PROTO(T) \ MPI_BROADCAST_PROTO_DEV(T,Device::CPU) #else #define MPI_BROADCAST_PROTO(T) \ MPI_BROADCAST_PROTO_DEV(T,Device::CPU); \ MPI_BROADCAST_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_BROADCAST_PROTO(byte); MPI_BROADCAST_PROTO(int); diff --git a/src/core/imports/mpi/Gather.hpp b/src/core/imports/mpi/Gather.hpp index f0d3d3a7e7..135f7d4225 100644 --- a/src/core/imports/mpi/Gather.hpp +++ b/src/core/imports/mpi/Gather.hpp @@ -126,7 +126,7 @@ void Gather(const T*, int, T*, int, int, Comm const&, SyncInfo const&) template void Gather(const Complex* sbuf, int sc, Complex* rbuf, \ int rc, int root, Comm const& comm, SyncInfo const&); -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_COLLECTIVE_PROTO(T) \ MPI_COLLECTIVE_PROTO_DEV(T,Device::CPU) #define MPI_COLLECTIVE_COMPLEX_PROTO(T) \ @@ -138,7 +138,7 @@ void Gather(const T*, int, T*, int, int, Comm const&, SyncInfo const&) #define MPI_COLLECTIVE_COMPLEX_PROTO(T) \ MPI_COLLECTIVE_COMPLEX_PROTO_DEV(T,Device::CPU) \ MPI_COLLECTIVE_COMPLEX_PROTO_DEV(T,Device::GPU) -#endif +#endif // HYDROGEN_HAVE_GPU MPI_COLLECTIVE_PROTO(byte) MPI_COLLECTIVE_PROTO(int) diff --git a/src/core/imports/mpi/Reduce.hpp b/src/core/imports/mpi/Reduce.hpp index e00e62bccf..4d20b53cf9 100644 --- a/src/core/imports/mpi/Reduce.hpp +++ b/src/core/imports/mpi/Reduce.hpp @@ -317,14 +317,14 @@ void Reduce(T* buf, int count, int root, Comm const& comm, template T Reduce(T, int, Comm const&, SyncInfo const&); \ template void Reduce(T*, int, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_REDUCE_PROTO(T) \ MPI_REDUCE_PROTO_DEV(T,Device::CPU) #else #define MPI_REDUCE_PROTO(T) \ 
MPI_REDUCE_PROTO_DEV(T,Device::CPU); \ MPI_REDUCE_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_REDUCE_PROTO(byte); MPI_REDUCE_PROTO(int); diff --git a/src/core/imports/mpi/ReduceScatter.hpp b/src/core/imports/mpi/ReduceScatter.hpp index 9d339b6dc8..d5f6bb9f43 100644 --- a/src/core/imports/mpi/ReduceScatter.hpp +++ b/src/core/imports/mpi/ReduceScatter.hpp @@ -1,4 +1,7 @@ // ReduceScatter +#ifdef HYDROGEN_HAVE_GPU +#include "hydrogen/device/gpu/BasicCopy.hpp" +#endif // HYDROGEN_HAVE_GPU namespace El { @@ -15,18 +18,16 @@ void LocalCopy(T const* EL_RESTRICT src, return std::copy_n(src, size, dest); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void LocalCopy(T const* EL_RESTRICT src, T* EL_RESTRICT dest, size_t size, SyncInfo const& si) { - H_CHECK_CUDA(cudaMemcpyAsync(dest, src, sizeof(T)*size, - cudaMemcpyDeviceToDevice, - si.stream_)); + gpu::Copy1DIntraDevice(src, dest, size, si); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU } // IsValidAluminumDeviceType should mean both that the device/type @@ -290,14 +291,14 @@ void ReduceScatter(T* buf, int rc, Comm const& comm, SyncInfo const& syncInfo template T ReduceScatter(T, Comm const&, SyncInfo const&); \ template void ReduceScatter(T*, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_REDUCESCATTER_PROTO(T) \ MPI_REDUCESCATTER_PROTO_DEV(T,Device::CPU) #else #define MPI_REDUCESCATTER_PROTO(T) \ MPI_REDUCESCATTER_PROTO_DEV(T,Device::CPU); \ MPI_REDUCESCATTER_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_REDUCESCATTER_PROTO(byte); MPI_REDUCESCATTER_PROTO(int); diff --git a/src/core/imports/mpi/Scatter.hpp b/src/core/imports/mpi/Scatter.hpp index d3c21f4555..6e316185b7 100644 --- a/src/core/imports/mpi/Scatter.hpp +++ b/src/core/imports/mpi/Scatter.hpp @@ -117,7 +117,7 @@ void Scatter(const T*, int, T*, int, int, Comm const&, SyncInfo const&) template void Scatter(const T* sbuf, int sc, T* rbuf, int rc, int root, \ Comm const& comm, SyncInfo const&) -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define MPI_COLLECTIVE_PROTO(T) \ MPI_COLLECTIVE_PROTO_DEV(T,Device::CPU); \ MPI_COLLECTIVE_PROTO_DEV(T,Device::GPU) diff --git a/src/core/imports/mpi/SendRecv.hpp b/src/core/imports/mpi/SendRecv.hpp index 8e9e12dd5c..75326bf87a 100644 --- a/src/core/imports/mpi/SendRecv.hpp +++ b/src/core/imports/mpi/SendRecv.hpp @@ -79,7 +79,7 @@ T SendRecv( T sb, int to, int from, Comm const& comm, SyncInfo const& syncInf T* buf, int count, int to, int from, Comm const& comm, \ SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_COLLECTIVE_PROTO(T) \ MPI_COLLECTIVE_PROTO_DEV(T,Device::CPU) #else diff --git a/src/core/imports/mpi_utils.hpp b/src/core/imports/mpi_utils.hpp index b26fd0fc7d..d5a84f78f7 100644 --- a/src/core/imports/mpi_utils.hpp +++ b/src/core/imports/mpi_utils.hpp @@ -17,7 +17,14 @@ #ifndef EL_IMPORTS_MPIUTILS_HPP #define EL_IMPORTS_MPIUTILS_HPP -namespace { +#include + +#ifdef HYDROGEN_HAVE_GPU +#include +#endif + +namespace +{ template MPI_Op NativeOp( const El::mpi::Op& op ) @@ -83,7 +90,7 @@ class PassthroughMemoryWrapper template class ManagedHostMemoryWrapper; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template class ManagedHostMemoryWrapper { @@ -101,23 +108,32 @@ class ManagedHostMemoryWrapper final_xfer_size_{final_xfer_size} { if ((host_data_.size() > 0) && (initial_xfer_size > 0)) - InterDeviceCopy::MemCopy1DAsync( - 
host_data_.data()+initial_xfer_offset, + { + gpu::Copy1DToHost( device_data_+initial_xfer_offset, - initial_xfer_size, syncInfo_.stream_); + host_data_.data()+initial_xfer_offset, + initial_xfer_size, syncInfo_); + } } ~ManagedHostMemoryWrapper() { // Transfer stuff back to device - if ((host_data_.size() > 0) && (final_xfer_size_ > 0)) + try { - InterDeviceCopy::MemCopy1DAsync( - device_data_+final_xfer_offset_, - host_data_.data()+final_xfer_offset_, - final_xfer_size_, syncInfo_.stream_); + if ((host_data_.size() > 0) && (final_xfer_size_ > 0)) + { + gpu::Copy1DToDevice( + host_data_.data()+final_xfer_offset_, + device_data_+final_xfer_offset_, + final_xfer_size_, syncInfo_); + } Synchronize(syncInfo_); } + catch (std::exception const& e) + { + H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(e); + } } // Enable move construction/assignment @@ -135,7 +151,7 @@ class ManagedHostMemoryWrapper size_t final_xfer_offset_; size_t final_xfer_size_; };// class ManagedHostMemoryWrapper -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template auto MakeHostBuffer(T* buf, size_t const& size, @@ -158,7 +174,7 @@ MakeManagedHostBuffer(T* buf, size_t const&, size_t const&, size_t const&, template struct type_check; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // This can't (shouldn't) just be std::vector because we want // pinned memory for GPUs. And I don't want to write a new Allocator // for std::vector that uses pinned memory through CUDA. We can access @@ -170,11 +186,10 @@ auto MakeHostBuffer(T const* buf, size_t const& size, { simple_buffer locbuf( size, SyncInfo{}, /*mode=*/ 1); - InterDeviceCopy::MemCopy1DAsync( - locbuf.data(), buf, size, syncInfo.stream_); + gpu::Copy1DToHost(buf, locbuf.data(), size, syncInfo); return locbuf; } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template auto MakeManagedHostBuffer( diff --git a/src/core/mpi_register.cpp b/src/core/mpi_register.cpp index 0712caf66d..f9ffa357c6 100644 --- a/src/core/mpi_register.cpp +++ b/src/core/mpi_register.cpp @@ -6,6 +6,7 @@ which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ +#define H_INSTANTIATING_MPI_TYPES_STRUCT #include using std::function; diff --git a/src/hydrogen/CMakeLists.txt b/src/hydrogen/CMakeLists.txt index da07066b1e..a171ea49ea 100644 --- a/src/hydrogen/CMakeLists.txt +++ b/src/hydrogen/CMakeLists.txt @@ -1,6 +1,12 @@ add_subdirectory(blas) +add_subdirectory(device) set(SOURCES "${SOURCES}" PARENT_SCOPE) if (HYDROGEN_HAVE_GPU) - set(CUDA_SOURCES "${CUDA_SOURCES}" PARENT_SCOPE) + set(GPU_SOURCES "${GPU_SOURCES}" PARENT_SCOPE) endif () + +set_full_path(THIS_DIR_CXX_SOURCES + Error.cpp) + +set(SOURCES "${SOURCES}" "${THIS_DIR_CXX_SOURCES}" PARENT_SCOPE) diff --git a/src/hydrogen/Error.cpp b/src/hydrogen/Error.cpp new file mode 100644 index 0000000000..44df791f3b --- /dev/null +++ b/src/hydrogen/Error.cpp @@ -0,0 +1,15 @@ +#include + +namespace hydrogen +{ +namespace +{ +volatile size_t break_on_me_called_ = 0UL; +} + +void break_on_me() +{ + break_on_me_called_ += 1UL; +} + +}// namespace hydrogen diff --git a/src/hydrogen/blas/CMakeLists.txt b/src/hydrogen/blas/CMakeLists.txt index 5487e6c7d6..3c47c8bd00 100644 --- a/src/hydrogen/blas/CMakeLists.txt +++ b/src/hydrogen/blas/CMakeLists.txt @@ -1,5 +1,5 @@ if (HYDROGEN_HAVE_GPU) add_subdirectory(gpu) - set(CUDA_SOURCES "${CUDA_SOURCES}" PARENT_SCOPE) + set(GPU_SOURCES "${GPU_SOURCES}" PARENT_SCOPE) endif () diff --git a/src/hydrogen/blas/gpu/Axpy.cu b/src/hydrogen/blas/gpu/Axpy.cu 
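The kernel sources that follow (Axpy.cu here, then Copy.cu, Fill.cu, Hadamard.cu, Scale.cu, Transpose.cu) all make the same change: hand-packed `void*` argument arrays passed to `cudaLaunchKernel` are replaced with a variadic `gpu::LaunchKernel` that takes the `SyncInfo` directly. The wrapper itself is defined outside this diff; as a rough sketch only, its CUDA path plausibly looks like this (name and signature assumed, not taken from this diff):

```cpp
// Assumed sketch of a variadic kernel-launch wrapper (CUDA path only);
// the real hydrogen::gpu::LaunchKernel is not shown in this diff.
template <typename F, typename... Args>
void LaunchKernel(F kernel, dim3 const& grid, dim3 const& block,
                  std::size_t shared_mem, SyncInfo<Device::GPU> const& si,
                  Args... args)
{
    // cudaLaunchKernel expects an array of pointers to the arguments;
    // taking the parameter pack by value makes each one addressable.
    void* arg_ptrs[] = {static_cast<void*>(&args)...};
    H_CHECK_CUDA(
        cudaLaunchKernel(reinterpret_cast<void const*>(kernel),
                         grid, block, arg_ptrs, shared_mem, si.Stream()));
}
```

On the ROCm path the same wrapper would forward to HIP's launch machinery instead, which is also why the kernels below swap cooperative-groups synchronization for plain `__syncthreads()` when HIP is the backend.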
index 1773463df3..e238c3af8c 100644 --- a/src/hydrogen/blas/gpu/Axpy.cu +++ b/src/hydrogen/blas/gpu/Axpy.cu @@ -2,11 +2,16 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include #include #include namespace cg = cooperative_groups; +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -20,7 +25,9 @@ __global__ void axpy_2d_transpose_tiled_kernel( // All the fun of a transpose meets the awesomeness of Axpy. :D // // remember: B is m x n, A is n x m +#ifdef HYDROGEN_HAVE_CUDA cg::thread_block cta = cg::this_thread_block(); +#endif __shared__ T tile[TILE_SIZE][TILE_SIZE+1]; auto const row_start_A = blockIdx.y * TILE_SIZE + threadIdx.x; @@ -49,7 +56,11 @@ __global__ void axpy_2d_transpose_tiled_kernel( } } +#ifdef HYDROGEN_HAVE_CUDA cg::sync(cta); +#else + __syncthreads(); +#endif // If I am a valid row in B, I need to store things if (row_start_B < m) @@ -109,7 +120,7 @@ void Axpy_GPU_impl( T alpha, T const* X, SizeT colStrideX, SizeT rowStrideX, T* Y, SizeT colStrideY, SizeT rowStrideY, - cudaStream_t stream) + SyncInfo const& sync_info) { if (height == TypeTraits::Zero() || width == TypeTraits::Zero()) @@ -127,14 +138,13 @@ void Axpy_GPU_impl( dim3 blks((height + TILE_SIZE - 1) / TILE_SIZE, (width + TILE_SIZE - 1) / TILE_SIZE, 1); dim3 thds(TILE_SIZE, BLK_COLS, 1); - void* args[] = {&height, &width, &alpha, - &X, &colStrideX, &rowStrideX, - &Y, &colStrideY, &rowStrideY}; - - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&axpy_2d_tiled_kernel, - blks, thds, args, 0, stream)); + + gpu::LaunchKernel( + axpy_2d_tiled_kernel, + blks, thds, 0, sync_info, + height, width, alpha, + X, colStrideX, rowStrideX, + Y, colStrideY, rowStrideY); } template @@ -144,7 +154,7 @@ void Axpy_GPU_impl( T alpha, T const* A, SizeT lda, T* B, SizeT ldb, - cudaStream_t stream) + SyncInfo const& sync_info) { // Short-circuit if (height <= TypeTraits::Zero() @@ -157,7 +167,7 @@ void Axpy_GPU_impl( return Axpy_GPU_impl( height, width, alpha, A, TypeTraits::One(), lda, - B, TypeTraits::One(), ldb, stream); + B, TypeTraits::One(), ldb, sync_info); constexpr int TILE_SIZE = 32; constexpr int BLK_COLS = 8; @@ -165,24 +175,22 @@ void Axpy_GPU_impl( dim3 blks((height + TILE_SIZE - 1) / TILE_SIZE, (width + TILE_SIZE - 1) / TILE_SIZE, 1); dim3 thds(TILE_SIZE, BLK_COLS, 1); - void* args[] = {&height, &width, &alpha, &A, &lda, &B, &ldb}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&axpy_2d_transpose_tiled_kernel - , - blks, thds, args, 0, stream)); + gpu::LaunchKernel( + axpy_2d_transpose_tiled_kernel, + blks, thds, 0, sync_info, + height, width, alpha, A, lda, B, ldb); } -#define ETI(ScalarT, SizeT) \ - template void Axpy_GPU_impl( \ - SizeT, SizeT, ScalarT, \ - ScalarT const*, SizeT, SizeT, \ - ScalarT*, SizeT, SizeT, cudaStream_t); \ - template void Axpy_GPU_impl( \ - TransposeMode, SizeT, SizeT, ScalarT, \ - ScalarT const*, SizeT, \ - ScalarT*, SizeT, cudaStream_t) +#define ETI(ScalarT, SizeT) \ + template void Axpy_GPU_impl( \ + SizeT, SizeT, ScalarT, \ + ScalarT const*, SizeT, SizeT, \ + ScalarT*, SizeT, SizeT, SyncInfo const&); \ + template void Axpy_GPU_impl( \ + TransposeMode, SizeT, SizeT, ScalarT, \ + ScalarT const*, SizeT, \ + ScalarT*, SizeT, SyncInfo const&) #ifdef HYDROGEN_GPU_USE_FP16 diff --git a/src/hydrogen/blas/gpu/CMakeLists.txt b/src/hydrogen/blas/gpu/CMakeLists.txt index 6b3fa46edb..a39e62bd4c 100644 --- a/src/hydrogen/blas/gpu/CMakeLists.txt +++ b/src/hydrogen/blas/gpu/CMakeLists.txt @@ -1,4 +1,4 @@ -set_full_path(THIS_DIR_CUDA_SOURCES 
+set_full_path(THIS_DIR_GPU_SOURCES Axpy.cu Copy.cu Fill.cu @@ -8,4 +8,4 @@ set_full_path(THIS_DIR_CUDA_SOURCES ) # Propagate the files up the tree -set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CUDA_SOURCES}" PARENT_SCOPE) +set(GPU_SOURCES "${GPU_SOURCES}" "${THIS_DIR_GPU_SOURCES}" PARENT_SCOPE) diff --git a/src/hydrogen/blas/gpu/Copy.cu b/src/hydrogen/blas/gpu/Copy.cu index ee03c7e401..aa5abbca22 100644 --- a/src/hydrogen/blas/gpu/Copy.cu +++ b/src/hydrogen/blas/gpu/Copy.cu @@ -2,9 +2,14 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -80,7 +85,7 @@ void Copy_GPU_impl( SizeT num_entries, SrcT const* src, SizeT src_stride, DestT * dest, SizeT dest_stride, - cudaStream_t stream) + SyncInfo const& sync_info) { if (num_entries <= TypeTraits::Zero()) return; @@ -97,13 +102,12 @@ void Copy_GPU_impl( constexpr size_t threads_per_block = 128; auto blocks = (num_entries + threads_per_block - 1)/ threads_per_block; - void* args[] = { &num_entries, &src, &src_stride, &dest, &dest_stride }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)©_1d_kernel, - blocks, threads_per_block, - args, 0, stream)); + gpu::LaunchKernel( + copy_1d_kernel, + blocks, threads_per_block, + 0, sync_info, + num_entries, src, src_stride, dest, dest_stride); } template @@ -111,7 +115,7 @@ void Copy_GPU_impl( SizeT num_rows, SizeT num_cols, SrcT const* src, SizeT src_row_stride, SizeT src_col_stride, DestT* dest, SizeT dest_row_stride, SizeT dest_col_stride, - cudaStream_t stream) + SyncInfo const& sync_info) { if (num_rows == 0 || num_cols == 0) return; @@ -132,24 +136,23 @@ void Copy_GPU_impl( dim3 blks((num_rows + TILE_SIZE - 1)/TILE_SIZE, (num_cols + TILE_SIZE - 1)/TILE_SIZE, 1); dim3 thds(TILE_SIZE, BLK_COLS, 1); - void* args[] = { &num_rows, &num_cols, - &src, &src_row_stride, &src_col_stride, - &dest, &dest_row_stride, &dest_col_stride }; - - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)©_2d_kernel, - blks, thds, args, 0, stream)); + + gpu::LaunchKernel( + copy_2d_kernel, + blks, thds, 0, sync_info, + num_rows, num_cols, + src, src_row_stride, src_col_stride, + dest, dest_row_stride, dest_col_stride); } -#define ETI(SourceType, DestType, SizeType) \ - template void Copy_GPU_impl( \ - SizeType, SourceType const*, SizeType, \ - DestType*, SizeType, cudaStream_t); \ - template void Copy_GPU_impl( \ - SizeType, SizeType, \ - SourceType const*, SizeType, SizeType, \ - DestType*, SizeType, SizeType, cudaStream_t) +#define ETI(SourceType, DestType, SizeType) \ + template void Copy_GPU_impl( \ + SizeType, SourceType const*, SizeType, \ + DestType*, SizeType, SyncInfo const&); \ + template void Copy_GPU_impl( \ + SizeType, SizeType, \ + SourceType const*, SizeType, SizeType, \ + DestType*, SizeType, SizeType, SyncInfo const&) ETI(float, float, int); ETI(float, float, long); diff --git a/src/hydrogen/blas/gpu/Fill.cu b/src/hydrogen/blas/gpu/Fill.cu index 456d8608bd..85c1590d1f 100644 --- a/src/hydrogen/blas/gpu/Fill.cu +++ b/src/hydrogen/blas/gpu/Fill.cu @@ -2,9 +2,14 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace hydrogen { @@ -36,25 +41,13 @@ __global__ void Fill2D_kernel(size_t height, size_t width, T value, } } -template -bool CompareEqual(T const& a, T const& b) -{ - return a == b; -} - -#ifdef HYDROGEN_GPU_USE_FP16 -inline bool CompareEqual(gpu_half_type const& a, gpu_half_type const& b) -{ - return float(a) 
== float(b); -} -#endif // HYDROGEN_GPU_USE_FP16 - }// namespace template void Fill_GPU_impl( size_t height, size_t width, T const& value, - T* buffer, size_t ldim, cudaStream_t stream) + T* buffer, size_t ldim, + SyncInfo const& sync_info) { if (height <= 0 || width <= 0) return; @@ -62,49 +55,28 @@ void Fill_GPU_impl( size_t size = height * width; constexpr size_t blockDim = 256; const size_t gridDim = (size + blockDim - 1) / blockDim; - if (CompareEqual(value, TypeTraits::Zero())) + + if (width == 1 || ldim == height) { - if (width == 1 || ldim == height) - { - H_CHECK_CUDA(cudaMemsetAsync(buffer, 0x0, size*sizeof(T), - stream)); - } - else - { - H_CHECK_CUDA( - cudaMemset2DAsync( - buffer, ldim*sizeof(T), 0x0, - height*sizeof(T), width, - stream)); - } + gpu::LaunchKernel( + Fill1D_kernel, + gridDim, blockDim, 0, sync_info, + size, value, buffer); } else { - T arg_value = value; - if (width == 1 || ldim == height) - { - void* args[] = {&size, &arg_value, &buffer}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Fill1D_kernel, - gridDim, blockDim, args, 0, stream)); - - } - else - { - void* args[] = {&height, &width, &arg_value, &buffer, &ldim}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Fill2D_kernel, - gridDim, blockDim, args, 0, stream)); - } + gpu::LaunchKernel( + Fill2D_kernel, + gridDim, blockDim, 0, sync_info, + height, width, value, buffer, ldim); } } -#define ETI(T) \ - template void Fill_GPU_impl( \ - size_t, size_t, T const&, T*, size_t, cudaStream_t) +#define ETI(T) \ + template void Fill_GPU_impl( \ + size_t, size_t, T const&, T*, size_t, \ + SyncInfo const&) #ifdef HYDROGEN_GPU_USE_FP16 ETI(gpu_half_type); diff --git a/src/hydrogen/blas/gpu/Hadamard.cu b/src/hydrogen/blas/gpu/Hadamard.cu index 1f838efa45..923830b499 100644 --- a/src/hydrogen/blas/gpu/Hadamard.cu +++ b/src/hydrogen/blas/gpu/Hadamard.cu @@ -1,8 +1,13 @@ #include #include +#ifdef HYDROGEN_HAVE_CUDA #include #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -61,7 +66,7 @@ void Hadamard_GPU_impl( T const* X, size_t colStrideX, size_t rowStrideX, T const* Y, size_t colStrideY, size_t rowStrideY, T* Z, size_t colStrideZ, size_t rowStrideZ, - cudaStream_t stream) + SyncInfo const& sync_info) { if (height <= 0 || width <= 0) { return; } size_t size = height * width; @@ -73,49 +78,45 @@ void Hadamard_GPU_impl( { if (X == Z) { - void* args[] = { &size, &Y, &Z }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&MultAssign_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + MultAssign_kernel, + gridDim, blockDim, 0, sync_info, + size, Y, Z); } else if (Y == Z) { - void* args[] = { &size, &X, &Z }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&MultAssign_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + MultAssign_kernel, + gridDim, blockDim, 0, sync_info, + size, X, Z); } else { - void* args[] = { &size, &X, &Y, &Z }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Hadamard1D_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + Hadamard1D_kernel, + gridDim, blockDim, 0, sync_info, + size, X, Y, Z); } } else { - void* args[] = { &height, &width, - &X, &colStrideX, &rowStrideX, - &Y, &colStrideY, &rowStrideY, - &Z, &colStrideZ, &rowStrideZ }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Hadamard2D_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + Hadamard2D_kernel, + gridDim, blockDim, 0, sync_info, + height, width, + X, colStrideX, rowStrideX, + Y, colStrideY, rowStrideY, + 
Z, colStrideZ, rowStrideZ); } } -#define ETI(T) \ - template void Hadamard_GPU_impl( \ - size_t, size_t, \ - T const*, size_t, size_t, \ - T const*, size_t, size_t, \ - T*, size_t, size_t, cudaStream_t) +#define ETI(T) \ + template void Hadamard_GPU_impl( \ + size_t, size_t, \ + T const*, size_t, size_t, \ + T const*, size_t, size_t, \ + T*, size_t, size_t, SyncInfo const&) #ifdef HYDROGEN_GPU_USE_FP16 ETI(gpu_half_type); diff --git a/src/hydrogen/blas/gpu/Scale.cu b/src/hydrogen/blas/gpu/Scale.cu index f55bcde28f..afc3abb3a5 100644 --- a/src/hydrogen/blas/gpu/Scale.cu +++ b/src/hydrogen/blas/gpu/Scale.cu @@ -2,9 +2,13 @@ #include #include +#ifdef HYDROGEN_HAVE_CUDA #include - #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -41,26 +45,24 @@ template void Scale_GPU_impl( SizeT num_entries, T const& alpha, T* A, SizeT lda, - cudaStream_t stream) + SyncInfo const& sync_info) { if (!num_entries) return; constexpr size_t threads_per_block = 128; auto blocks = (num_entries + threads_per_block - 1)/ threads_per_block; - T arg_alpha = alpha; - void* args[] = { &num_entries, &arg_alpha, &A, &lda}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&scale_1d_kernel_naive, - blocks, threads_per_block, args, 0, stream)); + gpu::LaunchKernel( + scale_1d_kernel_naive, + blocks, threads_per_block, 0, sync_info, + num_entries, alpha, A, lda); } template void Scale_GPU_impl( SizeT num_rows, SizeT num_cols, T const& alpha, T* A, SizeT lda, - cudaStream_t stream) + SyncInfo const& sync_info) { if (num_rows == TypeTraits::Zero() || num_cols == TypeTraits::Zero()) @@ -76,24 +78,21 @@ void Scale_GPU_impl( 1); dim3 thds(TILE_DIM, BLK_COLS, 1); - T arg_alpha = alpha; - void* args[] = { &num_rows, &num_cols, - &arg_alpha, &A, &lda}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&scale_2d_kernel_naive, - blks, thds, args, 0, stream)); + gpu::LaunchKernel( + scale_2d_kernel_naive, + blks, thds, 0, sync_info, + num_rows, num_cols, alpha, A, lda); } #define ETI(DataType, SizeType) \ template void Scale_GPU_impl( \ SizeType, \ DataType const&, DataType*, SizeType, \ - cudaStream_t); \ + SyncInfo const&); \ template void Scale_GPU_impl( \ SizeType, SizeType, \ DataType const&, DataType*, SizeType, \ - cudaStream_t) + SyncInfo const&) ETI(float, int); ETI(float, long); diff --git a/src/hydrogen/blas/gpu/Transpose.cu b/src/hydrogen/blas/gpu/Transpose.cu index e48bf5497d..66607d028e 100644 --- a/src/hydrogen/blas/gpu/Transpose.cu +++ b/src/hydrogen/blas/gpu/Transpose.cu @@ -2,12 +2,15 @@ #include #include +#ifdef HYDROGEN_HAVE_CUDA #include - #include - #include namespace cg = cooperative_groups; +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -19,7 +22,9 @@ __global__ void transpose_kernel( T const* __restrict__ A, SizeT const lda, T* __restrict__ B, SizeT const ldb) { +#ifdef HYDROGEN_HAVE_CUDA cg::thread_block cta = cg::this_thread_block(); +#endif __shared__ T tile[TILE_DIM][TILE_DIM+1]; SizeT row_idx_A = blockIdx.x * TILE_DIM + threadIdx.x; @@ -48,7 +53,11 @@ __global__ void transpose_kernel( tile[threadIdx.y+ii][threadIdx.x] = A[idx_in + ii*lda]; } +#ifdef HYDROGEN_HAVE_CUDA cg::sync(cta); +#else + __syncthreads(); +#endif #pragma unroll for (int ii = 0; ii < TILE_DIM; ii += BLK_COLS) @@ -70,7 +79,11 @@ __global__ void transpose_kernel( } // Same warp-sync stuff -- I assume this still needs to happen. +#ifdef HYDROGEN_HAVE_CUDA cg::sync(cta); +#else + __syncthreads(); +#endif // Don't write rows of the new matrix that don't exist. 
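    // (B is the n x m transpose of the m x n input A, so valid rows of
    // B are bounded by n.)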
if (row_idx < n) @@ -90,7 +103,7 @@ namespace hydrogen template void Transpose_GPU_impl( SizeT m, SizeT n, T const* A, SizeT lda, T* B, SizeT ldb, - cudaStream_t stream) + SyncInfo const& sync_info) { if (m == TypeTraits::Zero() || n == TypeTraits::Zero()) return; @@ -102,19 +115,18 @@ void Transpose_GPU_impl( (n + TILE_DIM - 1) / TILE_DIM, 1); dim3 thds(TILE_DIM, BLK_COLS, 1); - void* args[] = { &m, &n, &A, &lda, &B, &ldb }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)transpose_kernel, - blks, thds, args, 0, stream)); + gpu::LaunchKernel( + transpose_kernel, + blks, thds, 0, sync_info, + m, n, A, lda, B, ldb); } -#define ETI(DataType, SizeType) \ - template void Transpose_GPU_impl( \ - SizeType, SizeType, \ - DataType const*, SizeType, \ - DataType*, SizeType, cudaStream_t) +#define ETI(DataType, SizeType) \ + template void Transpose_GPU_impl( \ + SizeType, SizeType, \ + DataType const*, SizeType, \ + DataType*, SizeType, SyncInfo const&) ETI(float, int); ETI(float, long); diff --git a/src/hydrogen/device/CMakeLists.txt b/src/hydrogen/device/CMakeLists.txt new file mode 100644 index 0000000000..e205b84614 --- /dev/null +++ b/src/hydrogen/device/CMakeLists.txt @@ -0,0 +1,19 @@ +if (HYDROGEN_HAVE_GPU) + if (HYDROGEN_HAVE_CUDA) + set_full_path(THIS_DIR_CXX_SOURCES + CUDA.cpp + cuBLAS.cpp + cuBLAS_API.cpp + GPU.cpp) + endif () + if (HYDROGEN_HAVE_ROCM) + set_full_path(THIS_DIR_CXX_SOURCES + GPU.cpp + ROCm.cpp + rocBLAS.cpp + rocBLAS_API.cpp + ) + endif () + + set(SOURCES "${SOURCES}" "${THIS_DIR_CXX_SOURCES}" PARENT_SCOPE) +endif () diff --git a/src/hydrogen/device/CUDA.cpp b/src/hydrogen/device/CUDA.cpp new file mode 100644 index 0000000000..952eb58141 --- /dev/null +++ b/src/hydrogen/device/CUDA.cpp @@ -0,0 +1,163 @@ +#include "El/hydrogen_config.h" + +#include "hydrogen/device/GPU.hpp" +#include "hydrogen/device/gpu/CUDA.hpp" + +#include "hydrogen/Device.hpp" +#include "hydrogen/Error.hpp" +#include "hydrogen/SyncInfo.hpp" + +#include "El/core/MemoryPool.hpp" + +#include + +#include +#include + +#define H_CHECK_NVML(cmd) \ + { \ + auto h_check_nvml_error_code = cmd; \ + H_ASSERT(h_check_nvml_error_code == NVML_SUCCESS, \ + NVMLError, \ + BuildNVMLErrorMessage(#cmd, \ + h_check_nvml_error_code)); \ + } + +namespace hydrogen +{ +namespace gpu +{ +namespace +{ + +/** @class NVMLError + * @brief Exception class for errors detected in NVML + */ +H_ADD_BASIC_EXCEPTION_CLASS(NVMLError, GPUError);// struct NVMLError + +/** @brief Write an error message describing what went wrong in NVML + * @param[in] cmd The expression that raised the error. + * @param[in] error_code The error code reported by NVML. + * @returns A string describing the error. 
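+ * @note The readable text comes from NVML's own nvmlErrorString(),
+ *       called in the implementation below.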
+ */ +std::string BuildNVMLErrorMessage( + std::string const& cmd, nvmlReturn_t error_code) +{ + std::ostringstream oss; + oss << "NVML error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Mesg: " << nvmlErrorString(error_code) << "\n"; + return oss.str(); +} + +unsigned int PreCUDAInitDeviceCount() +{ + unsigned int count; + H_CHECK_NVML(nvmlInit()); + H_CHECK_NVML(nvmlDeviceGetCount(&count)); + H_CHECK_NVML(nvmlShutdown()); + return count; +} + +}// namespace hydrogen::gpu:: + +// +// GPU.hpp functions +// + +int DefaultDevice() +{ + static int device_id = + ComputeDeviceId(PreCUDAInitDeviceCount()); + return device_id; +} + +size_t DeviceCount() +{ + int count; + H_CHECK_CUDA(cudaGetDeviceCount(&count)); + return static_cast(count); +} + +int CurrentDevice() +{ + int device; + H_CHECK_CUDA(cudaGetDevice(&device)); + return device; +} + +void SetDevice(int device_id) +{ + H_CHECK_CUDA(cudaSetDevice(device_id)); + H_CHECK_CUDA(cudaGetLastError()); +} + +void SynchronizeDevice() +{ + H_CHECK_CUDA(cudaDeviceSynchronize()); +} + +}// namespace gpu + +namespace cuda +{ + +std::string BuildCUDAErrorMessage( + std::string const& cmd, cudaError_t error_code) +{ + std::ostringstream oss; + oss << "CUDA error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << cudaGetErrorName(error_code) << "\n" + << " Error Mesg: " << cudaGetErrorString(error_code); + return oss.str(); +} + +cudaEvent_t GetDefaultEvent() noexcept +{ + return gpu::DefaultSyncInfo().Event(); +} + +cudaStream_t GetDefaultStream() noexcept +{ + return gpu::DefaultSyncInfo().Stream(); +} + +cudaStream_t GetNewStream() +{ + cudaStream_t stream; + H_CHECK_CUDA( + cudaStreamCreateWithFlags( + &stream, cudaStreamNonBlocking)); + return stream; +} + +cudaEvent_t GetNewEvent() +{ + cudaEvent_t event; + H_CHECK_CUDA( + cudaEventCreateWithFlags( + &event, cudaEventDisableTiming)); + return event; +} + +void FreeStream(cudaStream_t& stream) +{ + if (stream) + { + H_CHECK_CUDA(cudaStreamDestroy(stream)); + stream = nullptr; + } +} + +void FreeEvent(cudaEvent_t& event) +{ + if (event) + { + H_CHECK_CUDA(cudaEventDestroy(event)); + event = nullptr; + } +} + +}// namespace cuda +}// namespace hydrogen diff --git a/src/hydrogen/device/GPU.cpp b/src/hydrogen/device/GPU.cpp new file mode 100644 index 0000000000..300e39eada --- /dev/null +++ b/src/hydrogen/device/GPU.cpp @@ -0,0 +1,105 @@ +#include "El/hydrogen_config.h" +#include "El/core/MemoryPool.hpp" +#include "hydrogen/device/GPU.hpp" + +#if defined HYDROGEN_HAVE_CUDA +#include "hydrogen/device/gpu/CUDA.hpp" +namespace impl = ::hydrogen::cuda; +#elif defined HYDROGEN_HAVE_ROCM +#include "hydrogen/device/gpu/ROCm.hpp" +namespace impl = ::hydrogen::rocm; +#endif // HYDROGEN_HAVE_CUDA + +#if defined HYDROGEN_HAVE_CUB +#include "hydrogen/device/gpu/CUB.hpp" +#endif + +namespace hydrogen +{ +namespace gpu +{ +namespace +{ + +// Global variables +bool gpu_initialized_ = false; +SyncInfo default_syncinfo_; + +}// namespace + +int ComputeDeviceId(unsigned int device_count) noexcept +{ + if (device_count == 0U) + return -1; + if (device_count == 1U) + return 0; + + // Get local rank (rank within compute node) + // + // TODO: Update to not rely on env vars + // TODO: Use HWLOC or something to pick "closest GPU" + int local_rank = 0; + char* env = nullptr; + if (!env) { env = std::getenv("SLURM_LOCALID"); } + if (!env) { env = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); } + if (!env) { env = 
std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); } + if (env) { local_rank = std::atoi(env); } + + // Try assigning GPUs to local ranks in round-robin fashion + return local_rank % device_count; +} + +void Initialize() +{ + if (gpu_initialized_) + return; // Or should this throw?? + + // This should fail if device < 0. + SetDevice(DefaultDevice()); + + // Setup a default stream and event. + default_syncinfo_ = CreateNewSyncInfo(); + + // Set the global flag + gpu_initialized_ = true; +} + +void Finalize() +{ + // FIXME: This stuff should move. +#ifdef HYDROGEN_HAVE_CUB + cub::DestroyMemoryPool(); +#endif // HYDROGEN_HAVE_CUB + El::DestroyPinnedHostMemoryPool(); + DestroySyncInfo(default_syncinfo_); + gpu_initialized_ = false; +} + +bool IsInitialized() noexcept +{ + return gpu_initialized_; +} + +SyncInfo const& DefaultSyncInfo() noexcept +{ + return default_syncinfo_; +} + +}// namespace gpu + +template <> +SyncInfo CreateNewSyncInfo() +{ + return SyncInfo{ + impl::GetNewStream(), impl::GetNewEvent()}; +} + +void DestroySyncInfo(SyncInfo& si) +{ + impl::FreeStream(si.stream_); + impl::FreeEvent(si.event_); + si.stream_ = nullptr; + si.event_ = nullptr; +} + +}// namespace hydrogen diff --git a/src/hydrogen/device/ROCm.cpp b/src/hydrogen/device/ROCm.cpp new file mode 100644 index 0000000000..db1592184c --- /dev/null +++ b/src/hydrogen/device/ROCm.cpp @@ -0,0 +1,101 @@ +#include + +#include +#include + +#include + +namespace hydrogen +{ +namespace gpu +{ + +size_t DeviceCount() +{ + int count; + H_CHECK_HIP(hipGetDeviceCount(&count)); + return count; +} + +int DefaultDevice() +{ + static int device_id = ComputeDeviceId(DeviceCount()); + return device_id; +} + +int CurrentDevice() +{ + int device_id; + H_CHECK_HIP(hipGetDevice(&device_id)); + return device_id; +} + +void SetDevice(int device_id) +{ + H_CHECK_HIP(hipSetDevice(device_id)); +} + +void SynchronizeDevice() +{ + H_CHECK_HIP(hipDeviceSynchronize()); +} + +}// namespace gpu + +namespace rocm +{ + +std::string BuildHipErrorMessage(std::string const& cmd, hipError_t error_code) +{ + std::ostringstream oss; + oss << "ROCm error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << hipGetErrorName(error_code) << "\n" + << " Error Mesg: " << hipGetErrorString(error_code); + return oss.str(); +} + +hipEvent_t GetDefaultEvent() noexcept +{ + return gpu::DefaultSyncInfo().Event(); +} + +hipStream_t GetDefaultStream() noexcept +{ + return gpu::DefaultSyncInfo().Stream(); +} + +hipStream_t GetNewStream() +{ + hipStream_t stream; + H_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + return stream; +} + +hipEvent_t GetNewEvent() +{ + hipEvent_t event; + H_CHECK_HIP(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + return event; +} + +void FreeStream(hipStream_t& stream) +{ + if (stream) + { + H_CHECK_HIP(hipStreamDestroy(stream)); + stream = nullptr; + } +} + +void FreeEvent(hipEvent_t& event) +{ + if (event) + { + H_CHECK_HIP(hipEventDestroy(event)); + event = nullptr; + } +} + +}// namespace rocm +}// namespace hydrogen diff --git a/src/hydrogen/device/cuBLAS.cpp b/src/hydrogen/device/cuBLAS.cpp new file mode 100644 index 0000000000..71133a58af --- /dev/null +++ b/src/hydrogen/device/cuBLAS.cpp @@ -0,0 +1,151 @@ +#include + +// Helper macro for converting enums to strings. 
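+// Each case simply returns the enum's own spelling, e.g. CUBLAS_STATUS_SUCCESS yields the string "CUBLAS_STATUS_SUCCESS".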
+#define H_ADD_CUBLAS_ENUM_TO_STRING_CASE(enum_value) \ + case enum_value: \ + return #enum_value + + +namespace +{ + +std::string GetcuBLASErrorString(cublasStatus_t status) +{ + switch (status) + { + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_SUCCESS); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_INITIALIZED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ALLOC_FAILED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INVALID_VALUE); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ARCH_MISMATCH); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_MAPPING_ERROR); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_EXECUTION_FAILED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INTERNAL_ERROR); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_SUPPORTED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_LICENSE_ERROR); + default: + return "Unknown cuBLAS error."; + } +} + +} + +namespace hydrogen +{ +namespace cublas +{ +namespace // +{ +bool cublas_is_initialized_ = false; +cublasHandle_t default_cublas_handle_; +}// namespace + +cublasHandle_t GetLibraryHandle() noexcept +{ + return default_cublas_handle_; +} + +bool IsInitialized() noexcept +{ + return cublas_is_initialized_; +} + +void Initialize(cublasHandle_t handle) +{ + if (!IsInitialized()) + { + if (!handle) + H_CHECK_CUBLAS(cublasCreate(&default_cublas_handle_)); + else + default_cublas_handle_ = handle; + + H_CHECK_CUBLAS( + cublasSetStream( + GetLibraryHandle(), cuda::GetDefaultStream())); + H_CHECK_CUBLAS( + cublasSetPointerMode( + GetLibraryHandle(), CUBLAS_POINTER_MODE_HOST)); +#ifdef HYDROGEN_GPU_USE_TENSOR_OP_MATH + H_CHECK_CUBLAS( + cublasSetMathMode(GetLibraryHandle(),CUBLAS_TENSOR_OP_MATH)); +#else + H_CHECK_CUBLAS( + cublasSetMathMode(GetLibraryHandle(),CUBLAS_DEFAULT_MATH)); +#endif // HYDROGEN_GPU_USE_TENSOR_OP_MATH + + cublas_is_initialized_ = true; + } +} + +void Finalize() +{ + if (default_cublas_handle_) + H_CHECK_CUBLAS(cublasDestroy(default_cublas_handle_)); + default_cublas_handle_ = nullptr; + cublas_is_initialized_ = false; +} + +void ReplaceLibraryHandle(cublasHandle_t handle) +{ + H_ASSERT_FALSE(handle == nullptr, + std::logic_error, + "hydrogen::cublas::ReplaceLibraryHandle(): " + "Detected a null cuBLAS handle."); + + H_ASSERT(IsInitialized(), + std::logic_error, + "hydrogen::cublas::ReplaceLibraryHandle(): " + "cuBLAS must be initialized before calling this function."); + + if (default_cublas_handle_) + H_CHECK_CUBLAS(cublasDestroy(default_cublas_handle_)); + default_cublas_handle_ = handle; +} + +SyncManager::SyncManager(cublasHandle_t handle, + SyncInfo const& si) +{ + H_CHECK_CUBLAS( + cublasGetStream(handle, &orig_stream_)); + H_CHECK_CUBLAS( + cublasSetStream(handle, si.Stream())); +} + +SyncManager::~SyncManager() +{ + try + { + H_CHECK_CUBLAS( + cublasSetStream( + GetLibraryHandle(), orig_stream_)); + } + catch (std::exception const& e) + { + H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(e); + } +} + +std::string BuildcuBLASErrorMessage( + std::string const& cmd, cublasStatus_t error_code) +{ + std::ostringstream oss; + oss << "cuBLAS error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << GetcuBLASErrorString(error_code); + return oss.str(); +} + +}// namespace cublas + +namespace gpu_blas +{ +void SetPointerMode(PointerMode mode) +{ + H_CHECK_CUBLAS( + cublasSetPointerMode(cublas::GetLibraryHandle(), + (mode == PointerMode::HOST + ? 
CUBLAS_POINTER_MODE_HOST + : CUBLAS_POINTER_MODE_DEVICE))); +} +}// namespace gpu_blas +}// namespace hydrogen diff --git a/src/core/imports/cublas.cpp b/src/hydrogen/device/cuBLAS_API.cpp similarity index 58% rename from src/core/imports/cublas.cpp rename to src/hydrogen/device/cuBLAS_API.cpp index 47f7af8cd9..0f5d457b72 100644 --- a/src/core/imports/cublas.cpp +++ b/src/hydrogen/device/cuBLAS_API.cpp @@ -3,9 +3,11 @@ #include #include + #ifdef HYDROGEN_GPU_USE_FP16 #include #endif // HYDROGEN_GPU_USE_FP16 + #include namespace hydrogen @@ -13,34 +15,6 @@ namespace hydrogen namespace cublas { -cublasHandle_t GetLibraryHandle() noexcept -{ - return GPUManager::cuBLASHandle(); -} - -void Initialize() -{ - GPUManager::InitializeCUBLAS(); -#ifdef HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH - H_CHECK_CUBLAS( - cublasSetMathMode(GetLibraryHandle(), CUBLAS_TENSOR_OP_MATH)); -#endif // HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH -} - -SyncManager::SyncManager(cublasHandle_t handle, - SyncInfo const& si) -{ - H_CHECK_CUBLAS( - cublasGetStream(handle, &orig_stream_)); - H_CHECK_CUBLAS( - cublasSetStream(handle, si.stream_)); -} - -SyncManager::~SyncManager() -{ - cublasSetStream(GPUManager::cuBLASHandle(), orig_stream_); -} - // // BLAS 1 // @@ -61,6 +35,38 @@ void Axpy(cublasHandle_t handle, CUDA_R_32F)); } +void Dot(cublasHandle_t handle, + int n, + __half const* X, int incx, + __half const* Y, int incy, + __half& output) +{ + H_CHECK_CUBLAS( + cublasDotEx( + handle, + n, + X, /*xtype=*/CUDA_R_16F, incx, + Y, /*ytype=*/CUDA_R_16F, incy, + &output, + /*resulttype=*/CUDA_R_16F, + /*executiontype=*/CUDA_R_32F)); +} + +void Nrm2(cublasHandle_t handle, + int n, + __half const* X, int incx, + __half& output) +{ + H_CHECK_CUBLAS( + cublasNrm2Ex( + handle, + n, + X, /*xtype=*/CUDA_R_16F, incx, + &output, + /*resulttype=*/CUDA_R_16F, + /*executiontype=*/CUDA_R_32F)); +} + void Scale(cublasHandle_t handle, int n, __half const& alpha, __half* X, int incx) @@ -94,6 +100,72 @@ void Scale(cublasHandle_t handle, n, X, incx, Y, incy)); \ } +#define ADD_DOT_IMPL(ScalarType, TypeChar) \ + void Dot(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## dot( \ + handle, \ + n, X, incx, Y, incy, output)); \ + } + +template +struct RealTypeT +{ + using type = T; +}; + +template <> +struct RealTypeT +{ + using type = float; +}; + +template <> +struct RealTypeT +{ + using type = double; +}; + +template +using RealType = typename RealTypeT::type; + +#define ADD_COMPLEX_DOT_IMPL(ScalarType, TypeChar) \ + void Dotu(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## dotu( \ + handle, \ + n, X, incx, Y, incy, output)); \ + } \ + void Dotc(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## dotc( \ + handle, \ + n, X, incx, Y, incy, output)); \ + } + +#define ADD_NRM2_IMPL(ScalarType, TypeChar) \ + void Nrm2(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + RealType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## nrm2( \ + handle, n, X, incx, output)); \ + } + #define ADD_SCALE_IMPL(ScalarType, TypeChar) \ void Scale(cublasHandle_t handle, \ int n, ScalarType const& alpha, \ @@ -140,14 +212,41 @@ void 
Scale(cublasHandle_t handle, ScalarType const& beta, \ ScalarType* C, int ldc) \ { \ - H_CHECK_CUBLAS( \ + H_CHECK_CUBLAS( \ cublas ## TypeChar ## gemm( \ - handle, \ - transpA, transpB, \ + handle, \ + transpA, transpB, \ m, n, k, &alpha, A, lda, B, ldb, \ &beta, C, ldc)); \ } +#define ADD_GEMM_STRIDED_BATCHED_IMPL(ScalarType, TypeChar) \ + void GemmStridedBatched( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, int k, \ + ScalarType const* alpha, \ + ScalarType const* A, int lda, \ + long long int strideA, \ + ScalarType const* B, int ldb, \ + long long int strideB, \ + ScalarType const* beta, \ + ScalarType* C, int ldc, \ + long long int strideC, \ + int batchCount) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## gemmStridedBatched( \ + handle, transpA, transpB, \ + m, n, k, \ + alpha, \ + A, lda, strideA, B, ldb, strideB, \ + beta, \ + C, ldc, strideC, \ + batchCount)); \ + } + // // BLAS-like Extension // @@ -163,7 +262,7 @@ void Scale(cublasHandle_t handle, ScalarType const* B, int ldb, \ ScalarType* C, int ldc) \ { \ - H_CHECK_CUBLAS( \ + H_CHECK_CUBLAS( \ cublas ## TypeChar ## geam( \ handle, \ transpA, transpB, \ @@ -182,7 +281,7 @@ void Scale(cublasHandle_t handle, ScalarType const* X, int incx, \ ScalarType* C, int ldc) \ { \ - H_CHECK_CUBLAS( \ + H_CHECK_CUBLAS( \ cublas ## TypeChar ## dgmm( \ handle, \ side, m, n, A, lda, X, incx, C, ldc)); \ @@ -199,6 +298,16 @@ ADD_COPY_IMPL(double, D) ADD_COPY_IMPL(cuComplex, C) ADD_COPY_IMPL(cuDoubleComplex, Z) +ADD_DOT_IMPL(float, S) +ADD_DOT_IMPL(double, D) +ADD_COMPLEX_DOT_IMPL(cuComplex, C) +ADD_COMPLEX_DOT_IMPL(cuDoubleComplex, Z) + +ADD_NRM2_IMPL(float, S) +ADD_NRM2_IMPL(double, D) +ADD_NRM2_IMPL(cuComplex, Sc) +ADD_NRM2_IMPL(cuDoubleComplex, Dz) + ADD_SCALE_IMPL(float, S) ADD_SCALE_IMPL(double, D) ADD_SCALE_IMPL(cuComplex, C) @@ -217,6 +326,12 @@ ADD_GEMM_IMPL(double, D) ADD_GEMM_IMPL(cuComplex, C) ADD_GEMM_IMPL(cuDoubleComplex, Z) +ADD_GEMM_STRIDED_BATCHED_IMPL(__half, H) +ADD_GEMM_STRIDED_BATCHED_IMPL(float, S) +ADD_GEMM_STRIDED_BATCHED_IMPL(double, D) +ADD_GEMM_STRIDED_BATCHED_IMPL(cuComplex, C) +ADD_GEMM_STRIDED_BATCHED_IMPL(cuDoubleComplex, Z) + // BLAS-like extension ADD_GEAM_IMPL(float, S) ADD_GEAM_IMPL(double, D) @@ -241,38 +356,53 @@ ADD_DGMM_IMPL(cuDoubleComplex, Z) ASSERT_SUPPORT(float, BLAS_Op::AXPY); ASSERT_SUPPORT(float, BLAS_Op::COPY); ASSERT_SUPPORT(float, BLAS_Op::DGMM); +ASSERT_SUPPORT(float, BLAS_Op::DOT); ASSERT_SUPPORT(float, BLAS_Op::GEAM); ASSERT_SUPPORT(float, BLAS_Op::GEMM); +ASSERT_SUPPORT(float, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(float, BLAS_Op::GEMV); +ASSERT_SUPPORT(float, BLAS_Op::NRM2); ASSERT_SUPPORT(float, BLAS_Op::SCAL); ASSERT_SUPPORT(double, BLAS_Op::AXPY); ASSERT_SUPPORT(double, BLAS_Op::COPY); ASSERT_SUPPORT(double, BLAS_Op::DGMM); +ASSERT_SUPPORT(double, BLAS_Op::DOT); ASSERT_SUPPORT(double, BLAS_Op::GEAM); ASSERT_SUPPORT(double, BLAS_Op::GEMM); +ASSERT_SUPPORT(double, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(double, BLAS_Op::GEMV); +ASSERT_SUPPORT(double, BLAS_Op::NRM2); ASSERT_SUPPORT(double, BLAS_Op::SCAL); ASSERT_SUPPORT(std::complex, BLAS_Op::AXPY); ASSERT_SUPPORT(std::complex, BLAS_Op::COPY); ASSERT_SUPPORT(std::complex, BLAS_Op::DGMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::DOT); ASSERT_SUPPORT(std::complex, BLAS_Op::GEAM); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMV); +ASSERT_SUPPORT(std::complex, 
BLAS_Op::NRM2); ASSERT_SUPPORT(std::complex, BLAS_Op::SCAL); ASSERT_SUPPORT(std::complex, BLAS_Op::AXPY); ASSERT_SUPPORT(std::complex, BLAS_Op::COPY); ASSERT_SUPPORT(std::complex, BLAS_Op::DGMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::DOT); ASSERT_SUPPORT(std::complex, BLAS_Op::GEAM); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMV); +ASSERT_SUPPORT(std::complex, BLAS_Op::NRM2); ASSERT_SUPPORT(std::complex, BLAS_Op::SCAL); #ifdef HYDROGEN_GPU_USE_FP16 ASSERT_SUPPORT(__half, BLAS_Op::AXPY); +ASSERT_SUPPORT(__half, BLAS_Op::DOT); ASSERT_SUPPORT(__half, BLAS_Op::GEMM); +ASSERT_SUPPORT(__half, BLAS_Op::GEMMSTRIDEDBATCHED); +ASSERT_SUPPORT(__half, BLAS_Op::NRM2); ASSERT_SUPPORT(__half, BLAS_Op::SCAL); ASSERT_NO_SUPPORT(__half, BLAS_Op::COPY); ASSERT_NO_SUPPORT(__half, BLAS_Op::DGMM); @@ -294,9 +424,12 @@ ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEMV); ASSERT_NO_SUPPORT(int, BLAS_Op::AXPY); ASSERT_NO_SUPPORT(int, BLAS_Op::COPY); ASSERT_NO_SUPPORT(int, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::DOT); ASSERT_NO_SUPPORT(int, BLAS_Op::GEAM); ASSERT_NO_SUPPORT(int, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_NO_SUPPORT(int, BLAS_Op::GEMV); +ASSERT_NO_SUPPORT(int, BLAS_Op::NRM2); } // namespace cublas diff --git a/src/hydrogen/device/rocBLAS.cpp b/src/hydrogen/device/rocBLAS.cpp new file mode 100644 index 0000000000..2904ea1b48 --- /dev/null +++ b/src/hydrogen/device/rocBLAS.cpp @@ -0,0 +1,143 @@ +#include + +// Helper macro for converting enums to strings. +#define H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(enum_value) \ + case enum_value: \ + return #enum_value + + +namespace +{ + +std::string GetrocBLASErrorString(rocblas_status status) +{ + switch (status) + { + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_success); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_invalid_handle); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_not_implemented); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_invalid_pointer); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_invalid_size); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_memory_error); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_internal_error); + default: + return "Unknown rocBLAS error."; + } +} + +} + +namespace hydrogen +{ + +namespace rocblas +{ +namespace // +{ +bool rocblas_is_initialized_ = false; +rocblas_handle default_rocblas_handle_; +}// namespace + +rocblas_handle GetLibraryHandle() noexcept +{ + return default_rocblas_handle_; +} + +bool IsInitialized() noexcept +{ + return rocblas_is_initialized_; +} + +void Initialize(rocblas_handle handle) +{ + if (!IsInitialized()) + { + if (!handle) + H_CHECK_ROCBLAS(rocblas_create_handle(&default_rocblas_handle_)); + else + default_rocblas_handle_ = handle; + + H_CHECK_ROCBLAS( + rocblas_set_stream( + GetLibraryHandle(), rocm::GetDefaultStream())); + H_CHECK_ROCBLAS( + rocblas_set_pointer_mode( + GetLibraryHandle(), rocblas_pointer_mode_host)); + + rocblas_is_initialized_ = true; + } +} + +void Finalize() +{ + if (default_rocblas_handle_) + H_CHECK_ROCBLAS(rocblas_destroy_handle(default_rocblas_handle_)); + default_rocblas_handle_ = nullptr; + rocblas_is_initialized_ = false; +} + +void ReplaceLibraryHandle(rocblas_handle handle) +{ + H_ASSERT_FALSE(handle == nullptr, + std::logic_error, + "hydrogen::rocblas::ReplaceLibraryHandle(): " + "Detected a null rocBLAS handle."); + + H_ASSERT(IsInitialized(), + std::logic_error, + 
"hydrogen::rocblas::ReplaceLibraryHandle(): " + "rocBLAS must be initialized before calling this function."); + + if (default_rocblas_handle_) + H_CHECK_ROCBLAS(rocblas_destroy_handle(default_rocblas_handle_)); + default_rocblas_handle_ = handle; +} + +SyncManager::SyncManager(rocblas_handle handle, + SyncInfo const& si) +{ + H_CHECK_ROCBLAS( + rocblas_get_stream(handle, &orig_stream_)); + H_CHECK_ROCBLAS( + rocblas_set_stream(handle, si.Stream())); +} + +SyncManager::~SyncManager() +{ + try + { + H_CHECK_ROCBLAS( + rocblas_set_stream( + GetLibraryHandle(), orig_stream_)); + } + catch (std::exception const& e) + { + H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(e); + } +} + +std::string BuildrocBLASErrorMessage( + std::string const& cmd, rocblas_status error_code) +{ + std::ostringstream oss; + oss << "rocBLAS error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << GetrocBLASErrorString(error_code); + return oss.str(); +} + +}// namespace rocblas + +namespace gpu_blas +{ +void SetPointerMode(PointerMode mode) +{ + H_CHECK_ROCBLAS( + rocblas_set_pointer_mode(rocblas::GetLibraryHandle(), + (mode == PointerMode::HOST + ? rocblas_pointer_mode_host + : rocblas_pointer_mode_device))); +} +}// namespace gpu_blas + +}// namespace hydrogen diff --git a/src/hydrogen/device/rocBLAS_API.cpp b/src/hydrogen/device/rocBLAS_API.cpp new file mode 100644 index 0000000000..3f2f4e6ce0 --- /dev/null +++ b/src/hydrogen/device/rocBLAS_API.cpp @@ -0,0 +1,285 @@ +#include + +#include +#include + +#include + +namespace hydrogen +{ +namespace rocblas +{ + +// +// BLAS 1 +// + +#define ADD_AXPY_IMPL(ScalarType, TypeChar) \ + void Axpy(rocblas_handle handle, \ + int n, ScalarType const& alpha, \ + ScalarType const* X, int incx, \ + ScalarType* Y, int incy) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## axpy( \ + handle, \ + n, &alpha, X, incx, Y, incy)); \ + } + +#define ADD_COPY_IMPL(ScalarType, TypeChar) \ + void Copy(rocblas_handle handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType* Y, int incy) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## copy( \ + handle, \ + n, X, incx, Y, incy)); \ + } + +#define ADD_DOT_IMPL(ScalarType, TypeChar) \ + void Dot(rocblas_handle handle, \ + int n, \ + ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* result) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## dot( \ + handle, \ + n, X, incx, Y, incy, result)); \ + } + +#define ADD_NRM2_IMPL(ScalarType, TypeChar) \ + void Nrm2(rocblas_handle handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType* result) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## nrm2( \ + handle, \ + n, X, incx, result)); \ + } + +#define ADD_SCALE_IMPL(ScalarType, TypeChar) \ + void Scale(rocblas_handle handle, \ + int n, ScalarType const& alpha, \ + ScalarType* X, int incx) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## scal( \ + handle, n, &alpha, X, incx)); \ + } + +// +// BLAS 2 +// +#define ADD_GEMV_IMPL(ScalarType, TypeChar) \ + void Gemv( \ + rocblas_handle handle, \ + rocblas_operation transpA, int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const* B, int ldb, \ + ScalarType const& beta, \ + ScalarType* C, int ldc) \ + { \ + H_CHECK_ROCBLAS(rocblas_ ## TypeChar ## gemv( \ + handle, \ + transpA, \ + m, n, \ + &alpha, A, lda, B, ldb, \ + &beta, C, ldc)); \ + } + +// +// BLAS 3 +// +#define ADD_GEMM_IMPL(ScalarType, TypeChar) \ + void Gemm( \ + rocblas_handle handle, \ 
+ rocblas_operation transpA, \ + rocblas_operation transpB, \ + rocblas_int m, rocblas_int n, rocblas_int k, \ + ScalarType const& alpha, \ + ScalarType const* A, rocblas_int lda, \ + ScalarType const* B, rocblas_int ldb, \ + ScalarType const& beta, \ + ScalarType* C, rocblas_int ldc) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## gemm( \ + handle, \ + transpA, transpB, \ + m, n, k, &alpha, A, lda, B, ldb, \ + &beta, C, ldc)); \ + } + +#define ADD_GEMM_STRIDED_BATCHED_IMPL(ScalarType, TypeChar) \ + void GemmStridedBatched( \ + rocblas_handle handle, \ + rocblas_operation transpA, \ + rocblas_operation transpB, \ + rocblas_int m, rocblas_int n, rocblas_int k, \ + ScalarType const& alpha, \ + ScalarType const* A, rocblas_int lda, rocblas_stride strideA, \ + ScalarType const* B, rocblas_int ldb, rocblas_stride strideB, \ + ScalarType const& beta, \ + ScalarType* C, rocblas_int ldc, rocblas_stride strideC, \ + rocblas_int batchCount) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## gemm_strided_batched( \ + handle, \ + transpA, transpB, \ + m, n, k, &alpha, \ + A, lda, strideA, \ + B, ldb, strideB, \ + &beta, C, ldc, strideC, batchCount)); \ + } + +// +// BLAS-like Extension +// +#define ADD_GEAM_IMPL(ScalarType, TypeChar) \ + void Geam( \ + rocblas_handle handle, \ + rocblas_operation transpA, \ + rocblas_operation transpB, \ + int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const& beta, \ + ScalarType const* B, int ldb, \ + ScalarType* C, int ldc) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## geam( \ + handle, \ + transpA, transpB, \ + m, n, \ + &alpha, A, lda, \ + &beta, B, ldb, \ + C, ldc)); \ + } + +#define ADD_DGMM_IMPL(ScalarType, TypeChar) \ + void Dgmm( \ + rocblas_handle handle, \ + rocblas_side side, \ + int m, int n, \ + ScalarType const* A, int lda, \ + ScalarType const* X, int incx, \ + ScalarType* C, int ldc) \ + { \ + H_CHECK_ROCBLAS(rocblas_status_not_implemented); \ + } + +// BLAS 1 +ADD_AXPY_IMPL(rocblas_half, h) +ADD_AXPY_IMPL(float, s) +ADD_AXPY_IMPL(double, d) + +ADD_COPY_IMPL(float, s) +ADD_COPY_IMPL(double, d) + +//ADD_DOT_IMPL(rocblas_half, h) +ADD_DOT_IMPL(float, s) +ADD_DOT_IMPL(double, d) + +//ADD_NRM2_IMPL(rocblas_half, h) +ADD_NRM2_IMPL(float, s) +ADD_NRM2_IMPL(double, d) + +ADD_SCALE_IMPL(float, s) +ADD_SCALE_IMPL(double, d) + +// BLAS 2 +ADD_GEMV_IMPL(float, s) +ADD_GEMV_IMPL(double, d) + +// BLAS 3 +ADD_GEMM_IMPL(rocblas_half, h) +ADD_GEMM_IMPL(float, s) +ADD_GEMM_IMPL(double, d) + +ADD_GEMM_STRIDED_BATCHED_IMPL(rocblas_half, h) +ADD_GEMM_STRIDED_BATCHED_IMPL(float, s) +ADD_GEMM_STRIDED_BATCHED_IMPL(double, d) + +// BLAS-like extension +ADD_GEAM_IMPL(float, s) +ADD_GEAM_IMPL(double, d) + +ADD_DGMM_IMPL(float, s) +ADD_DGMM_IMPL(double, d) + +// +// "STATIC" UNIT TEST +// + +#define ASSERT_SUPPORT(type, op) \ + static_assert(IsSupportedType::value, "") + +#define ASSERT_NO_SUPPORT(type, op) \ + static_assert(!IsSupportedType::value, "") + +ASSERT_SUPPORT(float, BLAS_Op::AXPY); +ASSERT_SUPPORT(float, BLAS_Op::COPY); +ASSERT_SUPPORT(float, BLAS_Op::GEAM); +ASSERT_SUPPORT(float, BLAS_Op::GEMM); +ASSERT_SUPPORT(float, BLAS_Op::GEMV); +ASSERT_SUPPORT(float, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(float, BLAS_Op::DGMM); +ASSERT_SUPPORT(float, BLAS_Op::DOT); +ASSERT_SUPPORT(float, BLAS_Op::NRM2); +ASSERT_SUPPORT(float, BLAS_Op::GEMMSTRIDEDBATCHED); + +ASSERT_SUPPORT(double, BLAS_Op::AXPY); +ASSERT_SUPPORT(double, BLAS_Op::COPY); +ASSERT_SUPPORT(double, BLAS_Op::GEAM); +ASSERT_SUPPORT(double, BLAS_Op::GEMM); 
+ASSERT_SUPPORT(double, BLAS_Op::GEMV); +ASSERT_SUPPORT(double, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(double, BLAS_Op::DGMM); +ASSERT_SUPPORT(double, BLAS_Op::DOT); +ASSERT_SUPPORT(double, BLAS_Op::NRM2); +ASSERT_SUPPORT(double, BLAS_Op::GEMMSTRIDEDBATCHED); + +#ifdef HYDROGEN_GPU_USE_FP16 +ASSERT_SUPPORT(rocblas_half, BLAS_Op::AXPY); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEMV); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::DOT); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::NRM2); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMMSTRIDEDBATCHED); + +#ifdef HYDROGEN_HAVE_HALF +ASSERT_SUPPORT(cpu_half_type, BLAS_Op::AXPY); +ASSERT_SUPPORT(cpu_half_type, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEMV); +#endif // HYDROGEN_HAVE_HALF +#endif // HYDROGEN_GPU_USE_FP16 + +// One type that should be entirely unsupported, just for sanity. +ASSERT_NO_SUPPORT(int, BLAS_Op::AXPY); +ASSERT_NO_SUPPORT(int, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(int, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMV); +ASSERT_NO_SUPPORT(int, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(int, BLAS_Op::DOT); +ASSERT_NO_SUPPORT(int, BLAS_Op::NRM2); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMMSTRIDEDBATCHED); + +} // namespace rocblas +} // namespace hydrogen diff --git a/src/io/Display.cpp b/src/io/Display.cpp index 775f3a6b3a..7957c12c28 100644 --- a/src/io/Display.cpp +++ b/src/io/Display.cpp @@ -32,7 +32,7 @@ void Display(AbstractMatrix const& A, std::string title) case Device::CPU: Display(static_cast const&>(A), title); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { // Copy to the CPU @@ -41,7 +41,7 @@ void Display(AbstractMatrix const& A, std::string title) Display(A_CPU, title); } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Display: Bad Device type."); } diff --git a/src/io/Print.cpp b/src/io/Print.cpp index 60e8d7a8de..c07e0dac2f 100644 --- a/src/io/Print.cpp +++ b/src/io/Print.cpp @@ -29,7 +29,7 @@ void Print(AbstractMatrix const& A, string title, ostream& os) case Device::CPU: Print(static_cast const&>(A), title, os); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { // Copy to host @@ -38,7 +38,7 @@ void Print(AbstractMatrix const& A, string title, ostream& os) Print(A_CPU, title, os); } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Print: Bad device."); } diff --git a/src/io/Read.cpp b/src/io/Read.cpp index 8762f6d1b9..3484e0a8d1 100644 --- a/src/io/Read.cpp +++ b/src/io/Read.cpp @@ -25,7 +25,7 @@ void Read(AbstractMatrix& A, case Device::CPU: Read(static_cast&>(A), filename, format); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { Matrix A_CPU; @@ -33,7 +33,7 @@ void Read(AbstractMatrix& A, static_cast&>(A) = A_CPU; } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Read: Bad device type."); } diff --git a/src/io/Write.cpp b/src/io/Write.cpp index 7399e25746..415132a3fe 100644 --- a/src/io/Write.cpp +++ 
b/src/io/Write.cpp @@ -27,7 +27,7 @@ void Write(AbstractMatrix const& A, string basename, Write(static_cast const&>(A), basename, format, title); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { // Copy to the CPU @@ -35,7 +35,7 @@ void Write(AbstractMatrix const& A, string basename, Write(A_CPU, basename, format, title); } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Write: Bad Device type."); } diff --git a/src/lapack_like/props/Norm/Frobenius.cpp b/src/lapack_like/props/Norm/Frobenius.cpp index dc84bd68a7..fd187b2260 100644 --- a/src/lapack_like/props/Norm/Frobenius.cpp +++ b/src/lapack_like/props/Norm/Frobenius.cpp @@ -17,13 +17,13 @@ Base FrobeniusNorm(AbstractMatrix const& A) { case Device::CPU: return FrobeniusNorm(static_cast const&>(A)); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { AbstractMatrixReadDeviceProxy ALocProxy{A}; return FrobeniusNorm(ALocProxy.GetLocked()); } -#endif //HYDROGEN_HAVE_CUDA +#endif //HYDROGEN_HAVE_GPU default: LogicError("FrobeniusNorm: Bad Device."); } diff --git a/src/matrices/random/independent/Gaussian.cpp b/src/matrices/random/independent/Gaussian.cpp index 6584744108..c64cd14a37 100644 --- a/src/matrices/random/independent/Gaussian.cpp +++ b/src/matrices/random/independent/Gaussian.cpp @@ -37,11 +37,11 @@ void MakeGaussian(AbstractMatrix& A, F mean, Base stddev) case Device::CPU: MakeGaussian(static_cast&>(A), mean, stddev); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: MakeGaussian(static_cast&>(A), mean, stddev); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("MakeGaussian: Bad device."); } diff --git a/src/matrices/random/independent/Uniform.cpp b/src/matrices/random/independent/Uniform.cpp index 45ec20d8fe..a0d0937334 100644 --- a/src/matrices/random/independent/Uniform.cpp +++ b/src/matrices/random/independent/Uniform.cpp @@ -23,11 +23,11 @@ void MakeUniform( AbstractMatrix& A, T center, Base radius ) case Device::CPU: MakeUniform(static_cast&>(A), center, radius); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: MakeUniform(static_cast&>(A), center, radius); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("MakeUniform: Bad device."); } @@ -82,7 +82,7 @@ void Uniform( AbstractDistMatrix& A, Int m, Int n, T center, Base radius ) template void MakeUniform( \ Matrix& A, T center, Base radius ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void MakeUniform(Matrix&, float, Base); template void MakeUniform(Matrix&, double, Base); @@ -91,7 +91,7 @@ ABSTRACT_PROTO(gpu_half_type); template void MakeUniform(Matrix&, gpu_half_type, Base); #endif -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/tests/blas_like/Axpy.cpp b/tests/blas_like/Axpy.cpp index d24e977565..22b5ecf91b 100644 --- a/tests/blas_like/Axpy.cpp +++ b/tests/blas_like/Axpy.cpp @@ -181,7 +181,7 @@ main(int argc, char* argv[]) // Message OutputFromRoot(g.Comm(),"Testing Axpy"); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU if (testGPU) { TestAxpy( @@ -191,7 +191,7 @@ main(int argc, char* argv[]) } #else (void)testGPU; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU // Run tests if (testCPU) { diff --git a/tests/blas_like/BasicGemm.cpp b/tests/blas_like/BasicGemm.cpp index cb58cde052..a1db01d210 100644 --- a/tests/blas_like/BasicGemm.cpp +++ 
b/tests/blas_like/BasicGemm.cpp @@ -275,7 +275,7 @@ int main(int argc, char *argv[]) El::Output("grid is ",grid.Height()," x ",grid.Width()); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU if (testGPU) { TestGemm @@ -285,7 +285,7 @@ int main(int argc, char *argv[]) } #else (void) testGPU; -#endif // HYDROGEN_ENABLE_CUDA +#endif // HYDROGEN_HAVE_GPU TestGemm (m, n, k, grid, testSequential, instrument); diff --git a/tests/blas_like/Gemm.cpp b/tests/blas_like/Gemm.cpp index 9324637183..bf18231e9e 100644 --- a/tests/blas_like/Gemm.cpp +++ b/tests/blas_like/Gemm.cpp @@ -7,6 +7,9 @@ http://opensource.org/licenses/BSD-2-Clause */ #include + +#include "GemmHelpers/SyncTimer.hpp" + using namespace El; template @@ -45,35 +48,6 @@ void TestAssociativity EFrobNorm, "/", YFrobNorm, "=", EFrobNorm/YFrobNorm); } -#ifdef HYDROGEN_HAVE_CUDA -#define START_CUDA_TIMER \ - if (D == Device::GPU) \ - cudaEventRecord(start, GPUManager::Stream()); - -#define STOP_CUDA_TIMER \ - if (D == Device::GPU) \ - { \ - cudaEventRecord(stop, GPUManager::Stream()); \ - cudaEventSynchronize(stop); \ - cudaEventElapsedTime(&cudaTime, start, stop); \ - } - -#define SUMMARIZE_CUDA_TIMER \ - if (D == Device::GPU) \ - { \ - runTime = cudaTime * 1e-3; \ - realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); \ - gFlops = (IsComplex::value ? 4*realGFlops : realGFlops); \ - OutputFromRoot(g.Comm(),"Finished in ",runTime, \ - " seconds (",gFlops," GFlop/s)"); \ - } - -#else -#define START_CUDA_TIMER do {} while (false) -#define STOP_CUDA_TIMER do {} while (false) -#define SUMMARIZE_CUDA_TIMER do {} while (false) -#endif - template void TestGemm (Orientation orientA, @@ -106,9 +80,9 @@ void TestGemm Gaussian(B, n, k); Gaussian(COrig, m, n); -#ifdef HYDROGEN_HAVE_CUDA - H_CHECK_CUDA(cudaDeviceSynchronize()); -#endif // HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU + El::gpu::SynchronizeDevice(); +#endif // HYDROGEN_HAVE_GPU if (print) { @@ -117,14 +91,11 @@ void TestGemm Print(COrig, "COrig"); } - Timer timer; -#ifdef HYDROGEN_HAVE_CUDA - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + helpers::SyncTimer timer(SyncInfoFromMatrix(C.LockedMatrix())); float cudaTime; // Warmup run -- doesn't matter in CPU land +#ifdef HYDROGEN_HAVE_GPU if (D == Device::GPU) { C = COrig; @@ -139,27 +110,26 @@ void TestGemm C = COrig; OutputFromRoot(g.Comm(),"Stationary A algorithm:"); PushIndent(); + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - START_CUDA_TIMER; - Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_A_MS); - STOP_CUDA_TIMER; - + Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_A); mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 
4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + OutputFromRoot( + g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); flush(std::cout); if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) - TestAssociativity(orientA, orientB, alpha, A, B, beta, COrig, C, print); + TestAssociativity(orientA, orientB, + alpha, A, B, beta, COrig, C, + print); PopIndent(); flush(std::cout); @@ -171,28 +141,25 @@ void TestGemm C = COrig; OutputFromRoot(g.Comm(),"Stationary B Algorithm:"); PushIndent(); + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - Synchronize(SyncInfoFromMatrix(C.Matrix())); - START_CUDA_TIMER; - Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_B_MS); - Synchronize(SyncInfoFromMatrix(C.Matrix())); - STOP_CUDA_TIMER; - + Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_B); mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + OutputFromRoot( + g.Comm(),"Finished in ",runTime, " seconds (",gFlops," GFlop/s)"); if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) - TestAssociativity(orientA, orientB, alpha, A, B, beta, COrig, C, print); + TestAssociativity(orientA, orientB, + alpha, A, B, beta, COrig, C, + print); PopIndent(); flush(std::cout); @@ -204,20 +171,19 @@ void TestGemm C = COrig; OutputFromRoot(g.Comm(),"Stationary C Algorithm:"); PushIndent(); + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - START_CUDA_TIMER; - Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_C_MS); - STOP_CUDA_TIMER; - + Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_C); mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + + OutputFromRoot( + g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); + if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) @@ -236,37 +202,32 @@ void TestGemm OutputFromRoot(g.Comm(),"Dot Product Algorithm:"); PushIndent(); C = COrig; + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - START_CUDA_TIMER; Gemm(NORMAL, NORMAL, alpha, A, B, beta, C, GEMM_SUMMA_DOT); - STOP_CUDA_TIMER; - mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 
4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops, - " GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + OutputFromRoot( + g.Comm(),"Finished in ",runTime," seconds (",gFlops, + " GFlop/s)"); if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) TestAssociativity (orientA, orientB, alpha, A, B, beta, COrig, C, print); + PopIndent(); flush(std::cout); } } PopIndent(); -#ifdef HYDROGEN_HAVE_CUDA - cudaEventDestroy(start); - cudaEventDestroy(stop); -#endif + flush(std::cout); } int @@ -310,10 +271,10 @@ main(int argc, char* argv[]) ComplainIfDebug(); OutputFromRoot(g.Comm(),"Will test Gemm",transA,transB); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU if (testGPU) { -#ifdef HYDROGEN_GPU_USE_FP16 +#if defined HYDROGEN_HAVE_HALF && defined HYDROGEN_GPU_USE_FP16 TestGemm (orientA, orientB, m, n, k, @@ -323,7 +284,7 @@ main(int argc, char* argv[]) colAlignA, rowAlignA, colAlignB, rowAlignB, colAlignC, rowAlignC); -#endif // HYDROGEN_GPU_USE_FP16 +#endif // defined HYDROGEN_HAVE_HALF && defined HYDROGEN_GPU_USE_FP16 TestGemm (orientA, orientB, m, n, k, diff --git a/tests/blas_like/GemmHelpers/SyncTimer.hpp b/tests/blas_like/GemmHelpers/SyncTimer.hpp index 8dd7f83fed..b2fbc88bf4 100644 --- a/tests/blas_like/GemmHelpers/SyncTimer.hpp +++ b/tests/blas_like/GemmHelpers/SyncTimer.hpp @@ -68,7 +68,7 @@ class SyncTimer if (started_ || stopped_) throw std::runtime_error("Start(): Bad timer state."); - H_CHECK_CUDA(cudaEventRecord(start_, si_.stream_)); + H_CHECK_CUDA(cudaEventRecord(start_, si_.Stream())); started_ = true; } @@ -77,7 +77,7 @@ class SyncTimer if (stopped_ || !started_) throw std::runtime_error("Stop(): Bad timer state."); - H_CHECK_CUDA(cudaEventRecord(stop_, si_.stream_)); + H_CHECK_CUDA(cudaEventRecord(stop_, si_.Stream())); stopped_ = true; } @@ -103,6 +103,69 @@ class SyncTimer cudaEvent_t start_, stop_; bool started_, stopped_; };// class SyncTimer + +#elif defined(HYDROGEN_HAVE_ROCM) + +template <> +class SyncTimer +{ +public: + SyncTimer(El::SyncInfo const& si) + : si_ {si}, + started_ {false}, + stopped_ {false} + { + H_CHECK_HIP(hipEventCreate(&start_)); + H_CHECK_HIP(hipEventCreate(&stop_)); + } + + ~SyncTimer() + { + hipEventDestroy(start_); + hipEventDestroy(stop_); + } + + void Start() + { + if (started_ || stopped_) + throw std::runtime_error("Start(): Bad timer state."); + + H_CHECK_HIP(hipEventRecord(start_, si_.Stream())); + started_ = true; + } + + void Stop() + { + if (stopped_ || !started_) + throw std::runtime_error("Stop(): Bad timer state."); + + H_CHECK_HIP(hipEventRecord(stop_, si_.Stream())); + stopped_ = true; + } + + /** @brief Get elapsed time in seconds. 
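+ * hipEventElapsedTime() reports milliseconds; the value is divided by 1000 before being returned as seconds.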
*/ + long double GetTime() const + { + if (!(started_ && stopped_)) + throw std::runtime_error("GetTime(): Bad timer state."); + + float elapsed_time_ms; + H_CHECK_HIP(hipEventSynchronize(stop_)); + H_CHECK_HIP(hipEventElapsedTime(&elapsed_time_ms, start_, stop_)); + return elapsed_time_ms / 1000.l; + } + + void Reset() + { + started_ = stopped_ = false; + } + +private: + El::SyncInfo si_; + hipEvent_t start_, stop_; + bool started_, stopped_; +};// class SyncTimer + #endif // HYDROGEN_HAVE_CUDA template diff --git a/tests/core/DistMatrix.cpp b/tests/core/DistMatrix.cpp index ad66fed2b4..23aaa628db 100644 --- a/tests/core/DistMatrix.cpp +++ b/tests/core/DistMatrix.cpp @@ -20,6 +20,7 @@ Check(DistMatrix& A, const Int height = B.Height(); const Int width = B.Width(); + SyncInfo cpu_si; OutputFromRoot (g.Comm(), @@ -29,8 +30,8 @@ Check(DistMatrix& A, ",",DeviceName(),"]"); Int colAlign = SampleUniform(0,A.ColStride()); Int rowAlign = SampleUniform(0,A.RowStride()); - mpi::Broadcast(colAlign, 0, g.Comm()); - mpi::Broadcast(rowAlign, 0, g.Comm()); + mpi::Broadcast(colAlign, 0, g.Comm(), cpu_si); + mpi::Broadcast(rowAlign, 0, g.Comm(), cpu_si); A.Align(colAlign, rowAlign); A = B; if (A.Height() != B.Height() || A.Width() != B.Width()) @@ -54,7 +55,7 @@ Check(DistMatrix& A, } Int summedErrorFlag; - mpi::AllReduce(&myErrorFlag, &summedErrorFlag, 1, mpi::SUM, g.Comm()); + mpi::AllReduce(&myErrorFlag, &summedErrorFlag, 1, mpi::SUM, g.Comm(), cpu_si); if (summedErrorFlag == 0) { @@ -66,7 +67,7 @@ Check(DistMatrix& A, } else { - OutputFromRoot(g.Comm(),"FAILED"); + OutputFromRoot(g.Comm(),"FAILED (", summedErrorFlag," ranks failed)"); if (print) Print(A, "A"); if (print) @@ -177,10 +178,12 @@ template void CheckAll(Int m, Int n, const Grid& grid, bool print) { DistMatrix A(grid); + SyncInfo cpu_si; + Int colAlign = SampleUniform(0,A.ColStride()); Int rowAlign = SampleUniform(0,A.RowStride()); - mpi::Broadcast(colAlign, 0, grid.Comm()); - mpi::Broadcast(rowAlign, 0, grid.Comm()); + mpi::Broadcast(colAlign, 0, grid.Comm(), cpu_si); + mpi::Broadcast(rowAlign, 0, grid.Comm(), cpu_si); A.Align(colAlign, rowAlign); const T center = 0; @@ -189,7 +192,7 @@ void CheckAll(Int m, Int n, const Grid& grid, bool print) CheckAll_device(A, print); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU CheckAll_device(A, print); #endif } @@ -240,7 +243,7 @@ DistMatrixTest(Int m, Int n, const Grid& grid, bool print) DistMatrixTest_device(m,n,grid,print); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU DistMatrixTest_device(m,n,grid,print); #endif } @@ -251,58 +254,63 @@ main(int argc, char* argv[]) Environment env(argc, argv); mpi::Comm comm = mpi::NewWorldComm(); - try + int gridHeight = Input("--gridHeight","height of process grid",0); + const bool colMajor = Input("--colMajor","column-major ordering?",true); + const Int m = Input("--height","height of matrix",50); + const Int n = Input("--width","width of matrix",50); + const bool print = Input("--print","print wrong matrices?",false); + const bool debug = Input("--debug","wait for debugger?",false); + ProcessInput(); + PrintInputReport(); + + if (gridHeight == 0) + gridHeight = Grid::DefaultHeight(mpi::Size(comm)); + const GridOrder order = colMajor ? 
COLUMN_MAJOR : ROW_MAJOR; + const Grid grid(std::move(comm), gridHeight, order); + + if (debug) { - int gridHeight = Input("--gridHeight","height of process grid",0); - const bool colMajor = Input("--colMajor","column-major ordering?",true); - const Int m = Input("--height","height of matrix",50); - const Int n = Input("--width","width of matrix",50); - const bool print = Input("--print","print wrong matrices?",false); - ProcessInput(); - PrintInputReport(); - - if (gridHeight == 0) - gridHeight = Grid::DefaultHeight(mpi::Size(comm)); - const GridOrder order = colMajor ? COLUMN_MAJOR : ROW_MAJOR; - const Grid grid(std::move(comm), gridHeight, order); + volatile int x = 1; + while (x) { + hydrogen::break_on_me(); + }; + } - DistMatrixTest(m, n, grid, print); + DistMatrixTest(m, n, grid, print); - DistMatrixTest(m, n, grid, print); - DistMatrixTest>(m, n, grid, print); + DistMatrixTest(m, n, grid, print); + DistMatrixTest>(m, n, grid, print); - DistMatrixTest(m, n, grid, print); - DistMatrixTest>(m, n, grid, print); + DistMatrixTest(m, n, grid, print); + DistMatrixTest>(m, n, grid, print); #ifdef EL_HAVE_QD - DistMatrixTest(m, n, grid, print); - DistMatrixTest(m, n, grid, print); + DistMatrixTest(m, n, grid, print); + DistMatrixTest(m, n, grid, print); #endif #ifdef EL_HAVE_QUAD - DistMatrixTest(m, n, grid, print); - DistMatrixTest>(m, n, grid, print); + DistMatrixTest(m, n, grid, print); + DistMatrixTest>(m, n, grid, print); #endif #ifdef HYDROGEN_HAVE_HALF - DistMatrixTest(m, n, grid, print); + DistMatrixTest(m, n, grid, print); #endif #ifdef EL_HAVE_MPC - DistMatrixTest(m, n, grid, print); - OutputFromRoot(g.Comm(),"Setting BigInt precision to 512 bits"); - mpfr::SetMinIntBits(512); - DistMatrixTest(m, n, grid, print); - - DistMatrixTest(m, n, grid, print); - DistMatrixTest>(m, n, grid, print); - OutputFromRoot(g.Comm(),"Setting BigFloat precision to 512 bits"); - mpfr::SetPrecision(512); - DistMatrixTest(m, n, grid, print); - DistMatrixTest>(m, n, grid, print); + DistMatrixTest(m, n, grid, print); + OutputFromRoot(g.Comm(),"Setting BigInt precision to 512 bits"); + mpfr::SetMinIntBits(512); + DistMatrixTest(m, n, grid, print); + + DistMatrixTest(m, n, grid, print); + DistMatrixTest>(m, n, grid, print); + OutputFromRoot(g.Comm(),"Setting BigFloat precision to 512 bits"); + mpfr::SetPrecision(512); + DistMatrixTest(m, n, grid, print); + DistMatrixTest>(m, n, grid, print); #endif - } - catch(std::exception& e) { ReportException(e); } return 0; } diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index 0318babe07..6df45d5c20 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -12,6 +12,11 @@ if (HYDROGEN_HAVE_GPU) endif () endif (HYDROGEN_HAVE_GPU) +if (HYDROGEN_HAVE_GPU) + list(APPEND HYDROGEN_CATCH2_TEST_FILES + gpu_test.cpp) +endif () + # Add the sequential test main() function add_executable(seq-catch-tests SequentialCatchMain.cpp "${HYDROGEN_CATCH2_TEST_FILES}") diff --git a/unit_test/gpu_test.cpp b/unit_test/gpu_test.cpp new file mode 100644 index 0000000000..f6429d0186 --- /dev/null +++ b/unit_test/gpu_test.cpp @@ -0,0 +1,19 @@ +#include + +#include + +TEST_CASE("Testing core GPU functionality", "[seq][gpu][init]") +{ + REQUIRE_FALSE(hydrogen::gpu::IsInitialized()); + REQUIRE(hydrogen::gpu::IsFinalized()); + + REQUIRE_NOTHROW(hydrogen::gpu::Initialize()); + + REQUIRE(hydrogen::gpu::IsInitialized()); + REQUIRE_FALSE(hydrogen::gpu::IsFinalized()); + + REQUIRE_NOTHROW(hydrogen::gpu::Finalize()); + + REQUIRE_FALSE(hydrogen::gpu::IsInitialized()); + 
REQUIRE(hydrogen::gpu::IsFinalized()); +}
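
For orientation, here is a minimal host-side sketch of the backend-neutral GPU lifecycle this patch introduces. It uses only symbols that appear in the diff (gpu::Initialize/Finalize, CreateNewSyncInfo, DestroySyncInfo, SynchronizeDevice, gpu::LaunchKernel); the include path is an assumption, since the header layout is not shown here.

    #include "hydrogen/device/GPU.hpp"  // assumed public entry point

    int main()
    {
        namespace gpu = hydrogen::gpu;

        gpu::Initialize();  // selects a device via DefaultDevice() and
                            // creates the default SyncInfo (stream + event)

        // Additional streams/events come packaged as SyncInfo objects.
        auto si = hydrogen::CreateNewSyncInfo<hydrogen::Device::GPU>();

        // ... enqueue work on si.Stream(), e.g. through gpu::LaunchKernel ...

        gpu::SynchronizeDevice();  // cudaDeviceSynchronize or hipDeviceSynchronize
        hydrogen::DestroySyncInfo(si);
        gpu::Finalize();
        return 0;
    }

On the BLAS side, cublas::SyncManager and rocblas::SyncManager follow the same pattern: the constructor points the library handle at si.Stream() and the destructor restores the handle's original stream, so a scope is all that is needed to run BLAS calls on a particular stream.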