From 7d581e9477eff1ca21dc4172c7465f965b2025cb Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Fri, 17 Jan 2020 09:39:50 -0800 Subject: [PATCH] Fix lbann half issues (#93) * tick up version to 1.3.3 * fix issues with CPU half * rework the copy interface * work on Copy * add a warmup run to the gemm test * Various updates to copy dispatch; no more ETI for Copy * fix an issue with the cuda half type's assignment operators * make gpu_half_type assignment operators into templates * Update include/El/blas_like/level1/CopyLocal.hpp Co-Authored-By: Tim Moon * remove some debugging output * add decls for BaseDistMatrix copy and copyasync * make gpu_half_type streamable * be a little more clever about casting to __half * fix some things * add overloads of sqrt and pow for half types * add unary minus for gpu half type * fix an issue where NVCC decided that rvalue references have value semantics * add an overload of Log for gpu_half_type * add exception-throwing bitwise operators for half. This is to appease Aluminum at compile-time and should never be encounted IRL. * add a bunch of missing library symbols * add a bunch of transendental functions, etc, for half types * fix some missing symbols when compiling without half support * Fixes to instantiate Read/Write with gpu half type * Add overload for instantiate with half types * add a few missing symbols * add a write impl for gpu half matrices * temp: dispatch gemv through Gemm for __half * Fix the GEMV as GEMM call * temporary error-throw for unhandled case * remove the incy != 1 gemv case * patch around an issue in the Half library * expose AbstractMatrix interface to Print() Co-authored-by: Tim Moon --- CMakeLists.txt | 3 +- cmake/modules/HydrogenETIGeneration.cmake | 31 + include/El/blas_like/level1/Copy.hpp | 842 ++---------------- .../blas_like/level1/CopyAsyncDistMatrix.hpp | 97 ++ .../El/blas_like/level1/CopyAsyncLocal.hpp | 134 +++ .../El/blas_like/level1/CopyDistMatrix.hpp | 183 ++++ include/El/blas_like/level1/CopyFromRoot.hpp | 50 ++ include/El/blas_like/level1/CopyLocal.hpp | 292 ++++++ include/El/blas_like/level1/decl.hpp | 63 +- include/El/blas_like/level1/impl.hpp | 1 - include/El/core.hpp | 11 - include/El/core/Element/decl.hpp | 88 +- include/El/core/Element/impl.hpp | 13 + include/El/core/Matrix/impl_gpu.hpp | 2 +- include/El/core/random/decl.hpp | 5 + include/El/core/random/impl.hpp | 28 +- include/El/io.hpp | 3 + include/hydrogen/blas/GPU_BLAS_impl.hpp | 43 + include/hydrogen/utils/HalfPrecision.hpp | 184 +++- .../hydrogen/utils/NumericTypeConversion.hpp | 19 + src/blas_like/level1/CMakeLists.txt | 1 + src/blas_like/level1/Copy.cpp | 113 +++ src/blas_like/level1/HilbertSchmidt.cpp | 8 +- src/blas_like/level2/Gemv.cpp | 13 + src/core/DistMatrix/AbstractDistMatrix.cpp | 55 ++ src/core/imports/blas/Dot.hpp | 8 +- src/core/imports/blas/Gemv.hpp | 75 +- src/core/imports/cuda.cpp | 1 - src/hydrogen/blas/gpu/Axpy.cu | 3 +- src/hydrogen/blas/gpu/Copy.cu | 4 +- src/hydrogen/blas/gpu/Fill.cu | 2 + src/hydrogen/blas/gpu/Scale.cu | 3 +- src/hydrogen/blas/gpu/Transpose.cu | 1 + src/io/Print.cpp | 2 + src/io/Read.cpp | 14 + src/io/Write.cpp | 23 + src/matrices/deterministic/classical/Ones.cpp | 10 +- .../deterministic/classical/Zeros.cpp | 4 + src/matrices/random/independent/Bernoulli.cpp | 18 +- src/matrices/random/independent/Gaussian.cpp | 4 + tests/blas_like/Gemm.cpp | 8 + unit_test/CMakeLists.txt | 13 +- unit_test/copy_cpu_gpu_half_test.cpp | 82 ++ unit_test/copy_cpu_gpu_test.cpp | 75 ++ 44 files changed, 1734 insertions(+), 898 deletions(-) create mode 100644 cmake/modules/HydrogenETIGeneration.cmake create mode 100644 include/El/blas_like/level1/CopyAsyncDistMatrix.hpp create mode 100644 include/El/blas_like/level1/CopyAsyncLocal.hpp create mode 100644 include/El/blas_like/level1/CopyDistMatrix.hpp create mode 100644 include/El/blas_like/level1/CopyFromRoot.hpp create mode 100644 include/El/blas_like/level1/CopyLocal.hpp create mode 100644 src/blas_like/level1/Copy.cpp create mode 100644 unit_test/copy_cpu_gpu_half_test.cpp create mode 100644 unit_test/copy_cpu_gpu_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ab73e1ef2..5c6760d528 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,7 @@ endif (__GIT_EXECUTABLE) # This must be set because version tags set(HYDROGEN_VERSION_MAJOR 1) set(HYDROGEN_VERSION_MINOR 3) -set(HYDROGEN_VERSION_PATCH 2) +set(HYDROGEN_VERSION_PATCH 3) set(HYDROGEN_VERSION_MAJOR_MINOR "${HYDROGEN_VERSION_MAJOR}.${HYDROGEN_VERSION_MINOR}") set(HYDROGEN_VERSION @@ -539,6 +539,7 @@ if (HYDROGEN_HAVE_CUDA) target_link_libraries(Hydrogen_CUDA PUBLIC ${NVTX_LIBRARIES}) target_link_libraries(Hydrogen_CUDA PUBLIC cuda::toolkit) + target_link_libraries(Hydrogen_CXX PUBLIC Hydrogen_CUDA) list(APPEND HYDROGEN_LIBRARIES Hydrogen_CUDA) endif () diff --git a/cmake/modules/HydrogenETIGeneration.cmake b/cmake/modules/HydrogenETIGeneration.cmake new file mode 100644 index 0000000000..1ec1f99d6b --- /dev/null +++ b/cmake/modules/HydrogenETIGeneration.cmake @@ -0,0 +1,31 @@ +# This function sets the _ARG# variable and recurs through the parameters. +function (h_gtpe_recur OUTVAR EXPRESSION_TEMPLATE THIS_EPARAM_ID THIS_EPARAM) + foreach (_VAL IN LISTS ${THIS_EPARAM}) + set(_ARG${THIS_EPARAM_ID} "${_VAL}") + if (ARGN) + math(EXPR _NEXT_ID "${THIS_EPARAM_ID} + 1") + h_gtpe_recur(${OUTVAR} "${EXPRESSION_TEMPLATE}" ${_NEXT_ID} ${ARGN}) + else () + string(CONFIGURE "${EXPRESSION_TEMPLATE}" _THIS_EXPRESSION @ONLY) + list(APPEND ${OUTVAR} "${_THIS_EXPRESSION}") + endif () + endforeach () + set(${OUTVAR} "${${OUTVAR}}" PARENT_SCOPE) +endfunction () + +# This function calls the recursive function above to generate each +# function instance. +# +# DO NOT ADD A SEMICOLON TO THE EXPRESSION_TEMPLATE PARAMETER!!!!!!! +function (h_generate_tensor_product_expression OUTVAR EXPRESSION_TEMPLATE) + h_gtpe_recur(_ALL_EXPRESSIONS "${EXPRESSION_TEMPLATE}" 0 ${ARGN}) + set(${OUTVAR} ${_ALL_EXPRESSIONS} PARENT_SCOPE) +endfunction () + +# This function adds a semicolon to each function instance in the list +# and joins them into a string with each function instance on its own +# line. +function (h_func_list_to_string OUTVAR INLIST) + list(JOIN ${INLIST} ";\n" _TMP) + set(${OUTVAR} "${_TMP};" PARENT_SCOPE) +endfunction () diff --git a/include/El/blas_like/level1/Copy.hpp b/include/El/blas_like/level1/Copy.hpp index 226977cdfa..7d835c4c4f 100644 --- a/include/El/blas_like/level1/Copy.hpp +++ b/include/El/blas_like/level1/Copy.hpp @@ -13,802 +13,154 @@ #include #endif +#include #include #include #include -#ifdef HYDROGEN_HAVE_GPU -#include -#endif - -namespace El { -namespace details { +#include -template