From fff5aa683d10cbad25af1f51493986bb75de1a31 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 21 Aug 2024 08:46:42 -0400 Subject: [PATCH 01/40] Remove NVRTC workaround from log1p --- .../boost/math/special_functions/log1p.hpp | 64 +++++-------------- include/boost/math/tools/series.hpp | 1 + test/test_log1p_nvrtc_double.cpp | 4 +- test/test_log1p_nvrtc_float.cpp | 12 ++-- 4 files changed, 25 insertions(+), 56 deletions(-) diff --git a/include/boost/math/special_functions/log1p.hpp b/include/boost/math/special_functions/log1p.hpp index cdec8ee86..758f60668 100644 --- a/include/boost/math/special_functions/log1p.hpp +++ b/include/boost/math/special_functions/log1p.hpp @@ -13,15 +13,13 @@ #endif #include - -#ifndef BOOST_MATH_HAS_NVRTC - -#include -#include -#include #include #include #include +#include +#include +#include +#include #include #include #include @@ -82,7 +80,7 @@ namespace detail // it performs no better than log(1+x): which is to say not very well at all. // template -BOOST_MATH_GPU_ENABLED T log1p_imp(T const & x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T log1p_imp(T const & x, const Policy& pol, const boost::math::integral_constant&) { // The function returns the natural logarithm of 1 + x. 
typedef typename tools::promote_args::type result_type; BOOST_MATH_STD_USING @@ -104,7 +102,7 @@ BOOST_MATH_GPU_ENABLED T log1p_imp(T const & x, const Policy& pol, const std::in if(a < tools::epsilon()) return x; detail::log1p_series s(x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); result_type result = tools::sum_series(s, policies::get_epsilon(), max_iter); @@ -113,7 +111,7 @@ BOOST_MATH_GPU_ENABLED T log1p_imp(T const & x, const Policy& pol, const std::in } template -BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const boost::math::integral_constant&) { // The function returns the natural logarithm of 1 + x. BOOST_MATH_STD_USING @@ -166,7 +164,7 @@ BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const std::int } template -BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const boost::math::integral_constant&) { // The function returns the natural logarithm of 1 + x. BOOST_MATH_STD_USING @@ -221,7 +219,7 @@ BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const std::int } template -BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const boost::math::integral_constant&) { // The function returns the natural logarithm of 1 + x. 
BOOST_MATH_STD_USING @@ -276,8 +274,8 @@ struct log1p_initializer do_init(tag()); } template - BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&){} - BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::log1p(static_cast(0.25), Policy()); } @@ -286,7 +284,9 @@ struct log1p_initializer BOOST_MATH_STATIC const init initializer; BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -309,7 +309,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type log1p(T x, c policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant::type return -x * x / 2; boost::math::detail::log1p_series s(x); s(); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T result = boost::math::tools::sum_series(s, policies::get_epsilon(), max_iter); @@ -476,40 +476,6 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type log1pmx(T x) } // namespace math } // namespace boost -#else // Special handling for NVRTC platform - -namespace boost { -namespace math { - -template -BOOST_MATH_GPU_ENABLED auto log1p(T x) -{ - return ::log1p(x); -} - -template <> -BOOST_MATH_GPU_ENABLED auto log1p(float x) -{ - return ::log1pf(x); -} - -template -BOOST_MATH_GPU_ENABLED auto log1p(T x, const Policy&) -{ - return ::log1p(x); -} - -template -BOOST_MATH_GPU_ENABLED auto log1p(float x, const Policy&) -{ - return ::log1pf(x); -} - -} // namespace math -} // namespace boost - -#endif // BOOST_MATH_HAS_NVRTC - #ifdef _MSC_VER #pragma warning(pop) #endif diff --git a/include/boost/math/tools/series.hpp 
b/include/boost/math/tools/series.hpp index 50f2828bb..4617ea3df 100644 --- a/include/boost/math/tools/series.hpp +++ b/include/boost/math/tools/series.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace boost{ namespace math{ namespace tools{ diff --git a/test/test_log1p_nvrtc_double.cpp b/test/test_log1p_nvrtc_double.cpp index 722194be6..36b0771b1 100644 --- a/test/test_log1p_nvrtc_double.cpp +++ b/test/test_log1p_nvrtc_double.cpp @@ -85,9 +85,9 @@ int main() nvrtcAddNameExpression(prog, "test_log1p_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN - const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #else - const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #endif // Compile the program diff --git a/test/test_log1p_nvrtc_float.cpp b/test/test_log1p_nvrtc_float.cpp index 772c50cd1..7194ffb56 100644 --- a/test/test_log1p_nvrtc_float.cpp +++ b/test/test_log1p_nvrtc_float.cpp @@ -7,6 +7,11 @@ #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error #define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +// Must be included first +#include +#include +#include + #include #include #include @@ -14,9 +19,6 @@ #include #include #include -#include -#include -#include typedef float float_type; @@ -85,9 +87,9 @@ int main() nvrtcAddNameExpression(prog, "test_log1p_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN - const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", 
"--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #else - const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #endif // Compile the program From 08895c7db93fb12e1fe6e2873e7f5f8bb8687833 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 11:00:32 -0400 Subject: [PATCH 02/40] Fix igamma_large support on device --- .../special_functions/detail/igamma_large.hpp | 73 +++++++++++-------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp index 1fa13c692..13790ca82 100644 --- a/include/boost/math/special_functions/detail/igamma_large.hpp +++ b/include/boost/math/special_functions/detail/igamma_large.hpp @@ -61,6 +61,7 @@ #endif #include +#include namespace boost{ namespace math{ namespace detail{ @@ -68,7 +69,7 @@ namespace boost{ namespace math{ namespace detail{ // when T is unsuitable to be passed to these routines: // template -inline T igamma_temme_large(T, T, const Policy& /* pol */, const std::integral_constant&) +inline T igamma_temme_large(T, T, const Policy& /* pol */, const boost::math::integral_constant&) { // stub function, should never actually be called BOOST_MATH_ASSERT(0); @@ -78,8 +79,11 @@ inline T igamma_temme_large(T, T, const Policy& /* pol */, const std::integral_c // This version is accurate for up to 64-bit mantissa's, // (80-bit long double, or 10^-20). 
// + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template -T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const boost::math::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -91,7 +95,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant -BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const boost::math::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -426,7 +433,7 @@ BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const s // (IEEE float precision, or 10^-8) // template -BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const boost::math::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -481,8 +488,10 @@ BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const s // It's use for a < 200 is not recommended, that would // require many more terms in the polynomials. 
// +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template -T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const boost::math::integral_constant&) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -494,7 +503,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, const std::integral_constant Date: Wed, 21 Aug 2024 09:25:39 -0400 Subject: [PATCH 03/40] Add GPU support to toms748 --- include/boost/math/tools/toms748_solve.hpp | 96 +++++++++++++--------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/include/boost/math/tools/toms748_solve.hpp b/include/boost/math/tools/toms748_solve.hpp index ea9371322..dee234685 100644 --- a/include/boost/math/tools/toms748_solve.hpp +++ b/include/boost/math/tools/toms748_solve.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,13 +11,13 @@ #pragma once #endif +#include #include +#include +#include +#include #include -#include #include -#include -#include -#include #ifdef BOOST_MATH_LOG_ROOT_ITERATIONS # define BOOST_MATH_LOGGER_INCLUDE @@ -32,29 +33,36 @@ template class eps_tolerance { public: - eps_tolerance() : eps(4 * tools::epsilon()) + BOOST_MATH_GPU_ENABLED eps_tolerance() : eps(4 * tools::epsilon()) { } - eps_tolerance(unsigned bits) + BOOST_MATH_GPU_ENABLED eps_tolerance(unsigned bits) { BOOST_MATH_STD_USING - eps = (std::max)(T(ldexp(1.0F, 1-bits)), T(4 * tools::epsilon())); + eps = BOOST_MATH_GPU_SAFE_MAX(T(ldexp(1.0F, 1-bits)), T(4 * tools::epsilon())); } - bool operator()(const T& a, const T& b) + BOOST_MATH_GPU_ENABLED bool operator()(const T& a, const T& b) { BOOST_MATH_STD_USING - return fabs(a - b) <= (eps * (std::min)(fabs(a), fabs(b))); + return fabs(a - b) <= (eps * BOOST_MATH_GPU_SAFE_MIN(fabs(a), fabs(b))); } private: T eps; }; +// CUDA warns about __host__ __device__ marker on defaulted constructor +// but the warning is benign +#ifdef BOOST_MATH_ENABLE_CUDA +# pragma nv_diag_suppress 20012 +#endif + struct equal_floor { - equal_floor()= default; + BOOST_MATH_GPU_ENABLED equal_floor() = default; + template - bool operator()(const T& a, const T& b) + BOOST_MATH_GPU_ENABLED bool operator()(const T& a, const T& b) { BOOST_MATH_STD_USING return (floor(a) == floor(b)) || (fabs((b-a)/b) < boost::math::tools::epsilon() * 2); @@ -63,9 +71,10 @@ struct equal_floor struct equal_ceil { - equal_ceil()= default; + BOOST_MATH_GPU_ENABLED equal_ceil() = default; + template - bool operator()(const T& a, const T& b) + BOOST_MATH_GPU_ENABLED bool operator()(const T& a, const T& b) { BOOST_MATH_STD_USING return (ceil(a) == ceil(b)) || (fabs((b - a) / b) < boost::math::tools::epsilon() * 2); @@ -74,19 +83,24 @@ struct equal_ceil struct equal_nearest_integer { - equal_nearest_integer()= 
default; + BOOST_MATH_GPU_ENABLED equal_nearest_integer() = default; + template - bool operator()(const T& a, const T& b) + BOOST_MATH_GPU_ENABLED bool operator()(const T& a, const T& b) { BOOST_MATH_STD_USING return (floor(a + 0.5f) == floor(b + 0.5f)) || (fabs((b - a) / b) < boost::math::tools::epsilon() * 2); } }; +#ifdef BOOST_MATH_ENABLE_CUDA +# pragma nv_diag_default 20012 +#endif + namespace detail{ template -void bracket(F f, T& a, T& b, T c, T& fa, T& fb, T& d, T& fd) +BOOST_MATH_GPU_ENABLED void bracket(F f, T& a, T& b, T c, T& fa, T& fb, T& d, T& fd) { // // Given a point c inside the existing enclosing interval @@ -150,7 +164,7 @@ void bracket(F f, T& a, T& b, T c, T& fa, T& fb, T& d, T& fd) } template -inline T safe_div(T num, T denom, T r) +BOOST_MATH_GPU_ENABLED inline T safe_div(T num, T denom, T r) { // // return num / denom without overflow, @@ -167,7 +181,7 @@ inline T safe_div(T num, T denom, T r) } template -inline T secant_interpolate(const T& a, const T& b, const T& fa, const T& fb) +BOOST_MATH_GPU_ENABLED inline T secant_interpolate(const T& a, const T& b, const T& fa, const T& fb) { // // Performs standard secant interpolation of [a,b] given @@ -188,9 +202,9 @@ inline T secant_interpolate(const T& a, const T& b, const T& fa, const T& fb) } template -T quadratic_interpolate(const T& a, const T& b, T const& d, - const T& fa, const T& fb, T const& fd, - unsigned count) +BOOST_MATH_GPU_ENABLED T quadratic_interpolate(const T& a, const T& b, T const& d, + const T& fa, const T& fb, T const& fd, + unsigned count) { // // Performs quadratic interpolation to determine the next point, @@ -244,9 +258,9 @@ T quadratic_interpolate(const T& a, const T& b, T const& d, } template -T cubic_interpolate(const T& a, const T& b, const T& d, - const T& e, const T& fa, const T& fb, - const T& fd, const T& fe) +BOOST_MATH_GPU_ENABLED T cubic_interpolate(const T& a, const T& b, const T& d, + const T& e, const T& fa, const T& fb, + const T& fd, const T& fe) { // // 
Uses inverse cubic interpolation of f(x) at points @@ -293,7 +307,7 @@ T cubic_interpolate(const T& a, const T& b, const T& d, } // namespace detail template -std::pair toms748_solve(F f, const T& ax, const T& bx, const T& fax, const T& fbx, Tol tol, std::uintmax_t& max_iter, const Policy& pol) +BOOST_MATH_GPU_ENABLED boost::math::pair toms748_solve(F f, const T& ax, const T& bx, const T& fax, const T& fbx, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) { // // Main entry point and logic for Toms Algorithm 748 @@ -301,15 +315,15 @@ std::pair toms748_solve(F f, const T& ax, const T& bx, const T& fax, const // BOOST_MATH_STD_USING // For ADL of std math functions - static const char* function = "boost::math::tools::toms748_solve<%1%>"; + constexpr auto function = "boost::math::tools::toms748_solve<%1%>"; // // Sanity check - are we allowed to iterate at all? // if (max_iter == 0) - return std::make_pair(ax, bx); + return boost::math::make_pair(ax, bx); - std::uintmax_t count = max_iter; + boost::math::uintmax_t count = max_iter; T a, b, fa, fb, c, u, fu, a0, b0, d, fd, e, fe; static const T mu = 0.5f; @@ -330,7 +344,7 @@ std::pair toms748_solve(F f, const T& ax, const T& bx, const T& fax, const b = a; else if(fb == 0) a = b; - return std::make_pair(a, b); + return boost::math::make_pair(a, b); } if(boost::math::sign(fa) * boost::math::sign(fb) > 0) @@ -472,37 +486,37 @@ std::pair toms748_solve(F f, const T& ax, const T& bx, const T& fax, const a = b; } BOOST_MATH_LOG_COUNT(max_iter) - return std::make_pair(a, b); + return boost::math::make_pair(a, b); } template -inline std::pair toms748_solve(F f, const T& ax, const T& bx, const T& fax, const T& fbx, Tol tol, std::uintmax_t& max_iter) +BOOST_MATH_GPU_ENABLED inline boost::math::pair toms748_solve(F f, const T& ax, const T& bx, const T& fax, const T& fbx, Tol tol, boost::math::uintmax_t& max_iter) { return toms748_solve(f, ax, bx, fax, fbx, tol, max_iter, policies::policy<>()); } template -inline 
std::pair toms748_solve(F f, const T& ax, const T& bx, Tol tol, std::uintmax_t& max_iter, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline boost::math::pair toms748_solve(F f, const T& ax, const T& bx, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) { if (max_iter <= 2) - return std::make_pair(ax, bx); + return boost::math::make_pair(ax, bx); max_iter -= 2; - std::pair r = toms748_solve(f, ax, bx, f(ax), f(bx), tol, max_iter, pol); + boost::math::pair r = toms748_solve(f, ax, bx, f(ax), f(bx), tol, max_iter, pol); max_iter += 2; return r; } template -inline std::pair toms748_solve(F f, const T& ax, const T& bx, Tol tol, std::uintmax_t& max_iter) +BOOST_MATH_GPU_ENABLED inline boost::math::pair toms748_solve(F f, const T& ax, const T& bx, Tol tol, boost::math::uintmax_t& max_iter) { return toms748_solve(f, ax, bx, tol, max_iter, policies::policy<>()); } template -std::pair bracket_and_solve_root(F f, const T& guess, T factor, bool rising, Tol tol, std::uintmax_t& max_iter, const Policy& pol) +BOOST_MATH_GPU_ENABLED boost::math::pair bracket_and_solve_root(F f, const T& guess, T factor, bool rising, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::tools::bracket_and_solve_root<%1%>"; + constexpr auto function = "boost::math::tools::bracket_and_solve_root<%1%>"; // // Set up initial brackets: // @@ -513,7 +527,7 @@ std::pair bracket_and_solve_root(F f, const T& guess, T factor, bool risin // // Set up invocation count: // - std::uintmax_t count = max_iter - 1; + boost::math::uintmax_t count = max_iter - 1; int step = 32; @@ -563,7 +577,7 @@ std::pair bracket_and_solve_root(F f, const T& guess, T factor, bool risin // Escape route just in case the answer is zero! max_iter -= count; max_iter += 1; - return a > 0 ? std::make_pair(T(0), T(a)) : std::make_pair(T(a), T(0)); + return a > 0 ? 
boost::math::make_pair(T(0), T(a)) : boost::math::make_pair(T(a), T(0)); } if(count == 0) return boost::math::detail::pair_from_single(policies::raise_evaluation_error(function, "Unable to bracket root, last nearest value was %1%", a, pol)); @@ -592,7 +606,7 @@ std::pair bracket_and_solve_root(F f, const T& guess, T factor, bool risin } max_iter -= count; max_iter += 1; - std::pair r = toms748_solve( + boost::math::pair r = toms748_solve( f, (a < 0 ? b : a), (a < 0 ? a : b), @@ -608,7 +622,7 @@ std::pair bracket_and_solve_root(F f, const T& guess, T factor, bool risin } template -inline std::pair bracket_and_solve_root(F f, const T& guess, const T& factor, bool rising, Tol tol, std::uintmax_t& max_iter) +BOOST_MATH_GPU_ENABLED inline boost::math::pair bracket_and_solve_root(F f, const T& guess, const T& factor, bool rising, Tol tol, boost::math::uintmax_t& max_iter) { return bracket_and_solve_root(f, guess, factor, rising, tol, max_iter, policies::policy<>()); } From 94ca4c88bfc69333fa6dccbffb13af6d55d5024b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 21 Aug 2024 09:27:08 -0400 Subject: [PATCH 04/40] Add GPU support to igamma_inv --- .../detail/igamma_inverse.hpp | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/include/boost/math/special_functions/detail/igamma_inverse.hpp b/include/boost/math/special_functions/detail/igamma_inverse.hpp index f6bbcd72d..fae36dddd 100644 --- a/include/boost/math/special_functions/detail/igamma_inverse.hpp +++ b/include/boost/math/special_functions/detail/igamma_inverse.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,6 +11,8 @@ #pragma once #endif +#include +#include #include #include #include @@ -21,7 +24,7 @@ namespace boost{ namespace math{ namespace detail{ template -T find_inverse_s(T p, T q) +BOOST_MATH_GPU_ENABLED T find_inverse_s(T p, T q) { // // Computation of the Incomplete Gamma Function Ratios and their Inverse @@ -41,8 +44,8 @@ T find_inverse_s(T p, T q) { t = sqrt(-2 * log(q)); } - static const double a[4] = { 3.31125922108741, 11.6616720288968, 4.28342155967104, 0.213623493715853 }; - static const double b[5] = { 1, 6.61053765625462, 6.40691597760039, 1.27364489782223, 0.3611708101884203e-1 }; + BOOST_MATH_STATIC const double a[4] = { 3.31125922108741, 11.6616720288968, 4.28342155967104, 0.213623493715853 }; + BOOST_MATH_STATIC const double b[5] = { 1, 6.61053765625462, 6.40691597760039, 1.27364489782223, 0.3611708101884203e-1 }; T s = t - tools::evaluate_polynomial(a, t) / tools::evaluate_polynomial(b, t); if(p < T(0.5)) s = -s; @@ -50,7 +53,7 @@ T find_inverse_s(T p, T q) } template -T didonato_SN(T a, T x, unsigned N, T tolerance = 0) +BOOST_MATH_GPU_ENABLED T didonato_SN(T a, T x, unsigned N, T tolerance = 0) { // // Computation of the Incomplete Gamma Function Ratios and their Inverse @@ -77,7 +80,7 @@ T didonato_SN(T a, T x, unsigned N, T tolerance = 0) } template -inline T didonato_FN(T p, T a, T x, unsigned N, T tolerance, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T didonato_FN(T p, T a, T x, unsigned N, T tolerance, const Policy& pol) { // // Computation of the Incomplete Gamma Function Ratios and their Inverse @@ -93,7 +96,7 @@ inline T didonato_FN(T p, T a, T x, unsigned N, T tolerance, const Policy& pol) } template -T find_inverse_gamma(T a, T p, T q, const Policy& pol, bool* p_has_10_digits) +BOOST_MATH_GPU_ENABLED T find_inverse_gamma(T a, T p, T q, const Policy& pol, bool* p_has_10_digits) { // // In order to understand what's going on here, 
you will @@ -233,7 +236,7 @@ T find_inverse_gamma(T a, T p, T q, const Policy& pol, bool* p_has_10_digits) } else { - T D = (std::max)(T(2), T(a * (a - 1))); + T D = BOOST_MATH_GPU_SAFE_MAX(T(2), T(a * (a - 1))); T lg = boost::math::lgamma(a, pol); T lb = log(q) + lg; if(lb < -D * T(2.3)) @@ -315,7 +318,7 @@ T find_inverse_gamma(T a, T p, T q, const Policy& pol, bool* p_has_10_digits) template struct gamma_p_inverse_func { - gamma_p_inverse_func(T a_, T p_, bool inv) : a(a_), p(p_), invert(inv) + BOOST_MATH_GPU_ENABLED gamma_p_inverse_func(T a_, T p_, bool inv) : a(a_), p(p_), invert(inv) { // // If p is too near 1 then P(x) - p suffers from cancellation @@ -333,7 +336,7 @@ struct gamma_p_inverse_func } } - boost::math::tuple operator()(const T& x)const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x)const { BOOST_FPU_EXCEPTION_GUARD // @@ -395,11 +398,11 @@ struct gamma_p_inverse_func }; template -T gamma_p_inv_imp(T a, T p, const Policy& pol) +BOOST_MATH_GPU_ENABLED T gamma_p_inv_imp(T a, T p, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std functions. 
- static const char* function = "boost::math::gamma_p_inv<%1%>(%1%, %1%)"; + BOOST_MATH_STATIC const char* function = "boost::math::gamma_p_inv<%1%>(%1%, %1%)"; BOOST_MATH_INSTRUMENT_VARIABLE(a); BOOST_MATH_INSTRUMENT_VARIABLE(p); @@ -442,7 +445,9 @@ T gamma_p_inv_imp(T a, T p, const Policy& pol) // // Go ahead and iterate: // - std::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + + #ifndef BOOST_MATH_HAS_GPU_SUPPORT guess = tools::halley_iterate( detail::gamma_p_inverse_func(a, p, false), guess, @@ -450,6 +455,16 @@ T gamma_p_inv_imp(T a, T p, const Policy& pol) tools::max_value(), digits, max_iter); + #else + guess = tools::newton_raphson_iterate( + detail::gamma_p_inverse_func(a, p, false), + guess, + lower, + tools::max_value(), + digits, + max_iter); + #endif + policies::check_root_iterations(function, max_iter, pol); BOOST_MATH_INSTRUMENT_VARIABLE(guess); if(guess == lower) @@ -458,11 +473,11 @@ T gamma_p_inv_imp(T a, T p, const Policy& pol) } template -T gamma_q_inv_imp(T a, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T gamma_q_inv_imp(T a, T q, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std functions. 
- static const char* function = "boost::math::gamma_q_inv<%1%>(%1%, %1%)"; + BOOST_MATH_STATIC const char* function = "boost::math::gamma_q_inv<%1%>(%1%, %1%)"; if(a <= 0) return policies::raise_domain_error(function, "Argument a in the incomplete gamma function inverse must be >= 0 (got a=%1%).", a, pol); @@ -501,7 +516,9 @@ T gamma_q_inv_imp(T a, T q, const Policy& pol) // // Go ahead and iterate: // - std::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + + #ifndef BOOST_MATH_HAS_GPU_SUPPORT guess = tools::halley_iterate( detail::gamma_p_inverse_func(a, q, true), guess, @@ -509,6 +526,16 @@ T gamma_q_inv_imp(T a, T q, const Policy& pol) tools::max_value(), digits, max_iter); + #else + guess = tools::newton_raphson_iterate( + detail::gamma_p_inverse_func(a, q, true), + guess, + lower, + tools::max_value(), + digits, + max_iter); + #endif + policies::check_root_iterations(function, max_iter, pol); if(guess == lower) guess = policies::raise_underflow_error(function, "Expected result known to be non-zero, but is smaller than the smallest available number.", pol); @@ -518,7 +545,7 @@ T gamma_q_inv_imp(T a, T q, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_p_inv(T1 a, T2 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -528,7 +555,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_q_inv(T1 a, T2 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -538,14 +565,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_p_inv(T1 a, T2 p) { return gamma_p_inv(a, p, 
policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_q_inv(T1 a, T2 p) { return gamma_q_inv(a, p, policies::policy<>()); From bbbf599755537cd1a54c7cdf880d0ed75e6194df Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 21 Aug 2024 09:28:51 -0400 Subject: [PATCH 05/40] Add GPU markers to gamma_inva --- .../special_functions/detail/gamma_inva.hpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/include/boost/math/special_functions/detail/gamma_inva.hpp b/include/boost/math/special_functions/detail/gamma_inva.hpp index 75ac89e43..61c64a648 100644 --- a/include/boost/math/special_functions/detail/gamma_inva.hpp +++ b/include/boost/math/special_functions/detail/gamma_inva.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -17,7 +18,7 @@ #pragma once #endif -#include +#include #include namespace boost{ namespace math{ namespace detail{ @@ -25,8 +26,8 @@ namespace boost{ namespace math{ namespace detail{ template struct gamma_inva_t { - gamma_inva_t(T z_, T p_, bool invert_) : z(z_), p(p_), invert(invert_) {} - T operator()(T a) + BOOST_MATH_GPU_ENABLED gamma_inva_t(T z_, T p_, bool invert_) : z(z_), p(p_), invert(invert_) {} + BOOST_MATH_GPU_ENABLED T operator()(T a) { return invert ? 
p - boost::math::gamma_q(a, z, Policy()) : boost::math::gamma_p(a, z, Policy()) - p; } @@ -36,7 +37,7 @@ struct gamma_inva_t }; template -T inverse_poisson_cornish_fisher(T lambda, T p, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_poisson_cornish_fisher(T lambda, T p, T q, const Policy& pol) { BOOST_MATH_STD_USING // mean: @@ -67,7 +68,7 @@ T inverse_poisson_cornish_fisher(T lambda, T p, T q, const Policy& pol) } template -T gamma_inva_imp(const T& z, const T& p, const T& q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T gamma_inva_imp(const T& z, const T& p, const T& q, const Policy& pol) { BOOST_MATH_STD_USING // for ADL of std lib math functions // @@ -151,7 +152,7 @@ T gamma_inva_imp(const T& z, const T& p, const T& q, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_p_inva(T1 x, T2 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -181,7 +182,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_q_inva(T1 x, T2 q, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -211,14 +212,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_p_inva(T1 x, T2 p) { return boost::math::gamma_p_inva(x, p, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type gamma_q_inva(T1 x, T2 q) { return boost::math::gamma_q_inva(x, q, policies::policy<>()); From e5feb4793d94634f5fd0ee7bc14f5aef133428e8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 13:48:15 -0400 Subject: [PATCH 06/40] Add GPU Markers to lgamma_small --- 
.../special_functions/detail/lgamma_small.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp index d9fa88b8e..1ce52fac3 100644 --- a/include/boost/math/special_functions/detail/lgamma_small.hpp +++ b/include/boost/math/special_functions/detail/lgamma_small.hpp @@ -13,6 +13,9 @@ #include #include +#include +#include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -32,13 +35,13 @@ namespace boost{ namespace math{ namespace detail{ template BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l); template -T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l); +BOOST_MATH_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l); // // lgamma for small arguments: // template -BOOST_MATH_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, const Policy& /* l */, const Lanczos&) +BOOST_MATH_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const boost::math::integral_constant&, const Policy& /* l */, const Lanczos&) { // This version uses rational approximations for small // values of z accurate enough for 64-bit mantissas @@ -226,8 +229,10 @@ BOOST_MATH_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const std::integral } return result; } + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT template -T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, const Policy& /* l */, const Lanczos&) +T lgamma_small_imp(T z, T zm1, T zm2, const boost::math::integral_constant&, const Policy& /* l */, const Lanczos&) { // // This version uses rational approximations for small @@ -484,7 +489,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, c return result; } template -T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, const Policy& pol, const Lanczos& l) 
+BOOST_MATH_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const boost::math::integral_constant&, const Policy& pol, const Lanczos& l) { // // No rational approximations are available because either @@ -528,6 +533,8 @@ T lgamma_small_imp(T z, T zm1, T zm2, const std::integral_constant&, con return result; } +#endif // BOOST_MATH_HAS_GPU_SUPPORT + }}} // namespaces #endif // BOOST_MATH_SPECIAL_FUNCTIONS_DETAIL_LGAMMA_SMALL From 4fa7d1cb09fb7bc61215748f8cbf999a92eddf5d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 15:48:15 -0400 Subject: [PATCH 07/40] Remove STL usage from gamma --- .../boost/math/special_functions/gamma.hpp | 78 ++++++++++--------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 9268ba415..e5990ca4f 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include #include @@ -36,12 +38,12 @@ #include #include #include + +// Only needed for types larger than double +#ifndef BOOST_MATH_HAS_GPU_SUPPORT #include #include - -#include -#include -#include +#endif #ifdef _MSC_VER # pragma warning(push) @@ -60,13 +62,13 @@ namespace boost{ namespace math{ namespace detail{ template -BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const std::true_type&) +BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const boost::math::true_type&) { int i = static_cast(v); return i&1; } template -BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const std::false_type&) +BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const boost::math::false_type&) { // Oh dear can't cast T to int! 
BOOST_MATH_STD_USING @@ -76,7 +78,7 @@ BOOST_MATH_GPU_ENABLED inline bool is_odd(T v, const std::false_type&) template BOOST_MATH_GPU_ENABLED inline bool is_odd(T v) { - return is_odd(v, ::std::is_convertible()); + return is_odd(v, ::boost::math::is_convertible()); } template @@ -259,7 +261,7 @@ BOOST_MATH_GPU_ENABLED T lgamma_imp_final(T z, const Policy& pol, const Lanczos& else if(z < 15) { typedef typename policies::precision::type precision_type; - typedef std::integral_constant(z, T(z - 1), T(z - 2), tag_type(), pol, l); } - else if((z >= 3) && (z < 100) && (std::numeric_limits::max_exponent >= 1024)) + else if((z >= 3) && (z < 100) && (boost::math::numeric_limits::max_exponent >= 1024)) { // taking the log of tgamma reduces the error, no danger of overflow here: result = log(gamma_imp(z, pol, l)); @@ -349,7 +351,7 @@ struct upper_incomplete_gamma_fract T z, a; int k; public: - typedef std::pair result_type; + typedef boost::math::pair result_type; BOOST_MATH_GPU_ENABLED upper_incomplete_gamma_fract(T a1, T z1) : z(z1-a1+1), a(a1), k(0) @@ -399,7 +401,7 @@ BOOST_MATH_GPU_ENABLED inline T lower_gamma_series(T a, T z, const Policy& pol, // lower incomplete integral. Then divide by tgamma(a) // to get the normalised value. lower_incomplete_gamma_series s(a, z); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T factor = policies::get_epsilon(); T result = boost::math::tools::sum_series(s, factor, max_iter, init_value); policies::check_series_iterations("boost::math::detail::lower_gamma_series<%1%>(%1%)", max_iter, pol); @@ -411,14 +413,14 @@ BOOST_MATH_GPU_ENABLED inline T lower_gamma_series(T a, T z, const Policy& pol, // with Bernoulli numbers. // template -std::size_t highest_bernoulli_index() +boost::math::size_t highest_bernoulli_index() { - const float digits10_of_type = (std::numeric_limits::is_specialized - ? 
static_cast(std::numeric_limits::digits10) + const float digits10_of_type = (boost::math::numeric_limits::is_specialized + ? static_cast(boost::math::numeric_limits::digits10) : static_cast(boost::math::tools::digits() * 0.301F)); // Find the high index n for Bn to produce the desired precision in Stirling's calculation. - return static_cast(18.0F + (0.6F * digits10_of_type)); + return static_cast(18.0F + (0.6F * digits10_of_type)); } template @@ -426,8 +428,8 @@ int minimum_argument_for_bernoulli_recursion() { BOOST_MATH_STD_USING - const float digits10_of_type = (std::numeric_limits::is_specialized - ? (float) std::numeric_limits::digits10 + const float digits10_of_type = (boost::math::numeric_limits::is_specialized + ? (float) boost::math::numeric_limits::digits10 : (float) (boost::math::tools::digits() * 0.301F)); int min_arg = (int) (digits10_of_type * 1.7F); @@ -449,7 +451,7 @@ int minimum_argument_for_bernoulli_recursion() const float d2_minus_one = ((digits10_of_type / 0.301F) - 1.0F); const float limit = ceil(exp((d2_minus_one * log(2.0F)) / 20.0F)); - min_arg = (int) ((std::min)(digits10_of_type * 1.7F, limit)); + min_arg = (int) (BOOST_MATH_GPU_SAFE_MIN(digits10_of_type * 1.7F, limit)); } return min_arg; @@ -468,7 +470,7 @@ T scaled_tgamma_no_lanczos(const T& z, const Policy& pol, bool islog = false) // Perform the Bernoulli series expansion of Stirling's approximation. 
- const std::size_t number_of_bernoullis_b2n = policies::get_max_series_iterations(); + const boost::math::size_t number_of_bernoullis_b2n = policies::get_max_series_iterations(); T one_over_x_pow_two_n_minus_one = 1 / z; const T one_over_x2 = one_over_x_pow_two_n_minus_one * one_over_x_pow_two_n_minus_one; @@ -477,11 +479,11 @@ T scaled_tgamma_no_lanczos(const T& z, const Policy& pol, bool islog = false) const T half_ln_two_pi_over_z = sqrt(boost::math::constants::two_pi() / z); T last_term = 2 * sum; - for (std::size_t n = 2U;; ++n) + for (boost::math::size_t n = 2U;; ++n) { one_over_x_pow_two_n_minus_one *= one_over_x2; - const std::size_t n2 = static_cast(n * 2U); + const boost::math::size_t n2 = static_cast(n * 2U); const T term = (boost::math::bernoulli_b2n(static_cast(n)) * one_over_x_pow_two_n_minus_one) / (n2 * (n2 - 1U)); @@ -786,7 +788,7 @@ BOOST_MATH_GPU_ENABLED T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& typedef typename policies::precision::type precision_type; - typedef std::integral_constant()) || ((std::max)(alz, amz) >= tools::log_max_value())) + if((BOOST_MATH_GPU_SAFE_MIN(alz, amz) <= tools::log_min_value()) || (BOOST_MATH_GPU_SAFE_MAX(alz, amz) >= tools::log_max_value())) { T amza = amz / a; - if(((std::min)(alz, amz)/2 > tools::log_min_value()) && ((std::max)(alz, amz)/2 < tools::log_max_value())) + if((BOOST_MATH_GPU_SAFE_MIN(alz, amz)/2 > tools::log_min_value()) && (BOOST_MATH_GPU_SAFE_MAX(alz, amz)/2 < tools::log_max_value())) { // compute square root of the result and then square it: T sq = pow(z / agh, a / 2) * exp(amz / 2); prefix = sq * sq; } - else if(((std::min)(alz, amz)/4 > tools::log_min_value()) && ((std::max)(alz, amz)/4 < tools::log_max_value()) && (z > a)) + else if((BOOST_MATH_GPU_SAFE_MIN(alz, amz)/4 > tools::log_min_value()) && (BOOST_MATH_GPU_SAFE_MAX(alz, amz)/4 < tools::log_max_value()) && (z > a)) { // compute the 4th root of the result then square it twice: T sq = pow(z / agh, a / 4) * exp(amz / 4); @@ 
-1092,7 +1094,7 @@ BOOST_MATH_GPU_ENABLED inline T tgamma_small_upper_part(T a, T x, const Policy& result -= p; result /= a; detail::small_gamma2_series s(a, x); - std::uintmax_t max_iter = policies::get_max_series_iterations() - 10; + boost::math::uintmax_t max_iter = policies::get_max_series_iterations() - 10; p += 1; if(pderivative) *pderivative = p / (*pgam * exp(x)); @@ -1192,7 +1194,7 @@ BOOST_MATH_GPU_ENABLED T incomplete_tgamma_large_x(const T& a, const T& x, const { BOOST_MATH_STD_USING incomplete_tgamma_large_x_series s(a, x); - std::uintmax_t max_iter = boost::math::policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = boost::math::policies::get_max_series_iterations(); T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter); boost::math::policies::check_series_iterations("boost::math::tgamma<%1%>(%1%,%1%)", max_iter, pol); return result; @@ -1357,7 +1359,7 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in // series and continued fractions are slow to converge: // bool use_temme = false; - if(normalised && std::numeric_limits::is_specialized && (a > 20)) + if(normalised && boost::math::numeric_limits::is_specialized && (a > 20)) { T sigma = fabs((x-a)/a); if((a > 200) && (policies::digits() <= 113)) @@ -1507,7 +1509,7 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in // typedef typename policies::precision::type precision_type; - typedef std::integral_constant::type precision_type; - typedef std::integral_constant - BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { // If std::numeric_limits::digits is zero, we must not call // our initialization code here as the precision presumably // varies at runtime, and will not have been set yet. Plus the // code requiring initialization isn't called when digits == 0. 
- if (std::numeric_limits::digits) + if (boost::math::numeric_limits::digits) { boost::math::gamma_p(static_cast(400), static_cast(400), Policy()); } } - BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} void force_instantiate()const{} }; BOOST_MATH_STATIC const init initializer; @@ -1945,7 +1947,7 @@ struct lgamma_initializer BOOST_MATH_GPU_ENABLED init() { typedef typename policies::precision::type precision_type; - typedef std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::lgamma(static_cast(2.5), Policy()); boost::math::lgamma(static_cast(1.25), Policy()); boost::math::lgamma(static_cast(1.75), Policy()); } - BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { boost::math::lgamma(static_cast(2.5), Policy()); boost::math::lgamma(static_cast(1.25), Policy()); boost::math::lgamma(static_cast(1.5), Policy()); boost::math::lgamma(static_cast(1.75), Policy()); } - BOOST_MATH_GPU_ENABLED static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { } BOOST_MATH_GPU_ENABLED void force_instantiate()const{} @@ -2079,7 +2081,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - return policies::checked_narrowing_cast::type, forwarding_policy>(detail::tgammap1m1_imp(static_cast(z), forwarding_policy(), evaluation_type()), "boost::math::tgamma1pm1<%!%>(%1%)"); + return policies::checked_narrowing_cast::type, forwarding_policy>(detail::tgammap1m1_imp(static_cast(z), forwarding_policy(), evaluation_type()), "boost::math::tgamma1pm1<%!%>(%1%)"); } template From e7fd6bf6cb041715c69e2644ad58a61d3926ad94 Mon Sep 17 
00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 16:00:44 -0400 Subject: [PATCH 08/40] Remove NVRTC workaround --- .../boost/math/special_functions/gamma.hpp | 90 ++++--------------- 1 file changed, 18 insertions(+), 72 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index e5990ca4f..8a47d600f 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -15,9 +15,6 @@ #endif #include - -#ifndef BOOST_MATH_HAS_NVRTC - #include #include #include @@ -457,6 +454,8 @@ int minimum_argument_for_bernoulli_recursion() return min_arg; } +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template T scaled_tgamma_no_lanczos(const T& z, const Policy& pol, bool islog = false) { @@ -777,6 +776,8 @@ T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sig return log_gamma_value; } +#endif // BOOST_MATH_HAS_GPU_SUPPORT + // // This helper calculates tgamma(dz+1)-1 without cancellation errors, // used by the upper incomplete gamma with z < 1: @@ -830,6 +831,8 @@ BOOST_MATH_GPU_ENABLED T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& return result; } +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template inline T tgammap1m1_imp(T z, Policy const& pol, const ::boost::math::lanczos::undefined_lanczos&) @@ -843,6 +846,8 @@ inline T tgammap1m1_imp(T z, Policy const& pol, return boost::math::expm1(boost::math::lgamma(1 + z, pol)); } +#endif // BOOST_MATH_HAS_GPU_SUPPORT + // // Series representation for upper fraction when z is small: // @@ -1006,6 +1011,9 @@ BOOST_MATH_GPU_ENABLED T regularised_gamma_prefix(T a, T z, const Policy& pol, c prefix *= sqrt(agh / boost::math::constants::e()) / Lanczos::lanczos_sum_expG_scaled(a); return prefix; } + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + // // And again, without Lanczos support: // @@ -1075,6 +1083,9 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const lanczos::undefined } } } + +#endif 
// BOOST_MATH_HAS_GPU_SUPPORT + // // Upper gamma fraction for very small a: // @@ -1653,6 +1664,8 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli // // And again without Lanczos support this time: // +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + template T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const lanczos::undefined_lanczos& l) { @@ -1709,6 +1722,8 @@ T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const lanczos: return ratio; } +#endif + template BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) { @@ -2286,73 +2301,4 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t #include #include -#else - -#include -#include - -namespace boost { -namespace math { - -inline BOOST_MATH_GPU_ENABLED float tgamma(float x) { return ::tgammaf(x); } -inline BOOST_MATH_GPU_ENABLED double tgamma(double x) { return ::tgamma(x); } - -template -BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&) -{ - return boost::math::tgamma(x); -} - -inline BOOST_MATH_GPU_ENABLED float lgamma(float x) { return ::lgammaf(x); } -inline BOOST_MATH_GPU_ENABLED double lgamma(double x) { return ::lgamma(x); } - -template -BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&) -{ - return boost::math::lgamma(x); -} - -template -BOOST_MATH_GPU_ENABLED T lgamma(T x, int* sign, const Policy&) -{ - auto res = boost::math::lgamma(x); - if (sign != nullptr) - { - if (res < 0) - { - *sign = -1; - } - else - { - *sign = 1; - } - } - - return res; -} - -template -BOOST_MATH_GPU_ENABLED T tgamma1pm1(T z) -{ - using namespace boost::math; - - if (fabs(z) < T(0.55)) - { - return expm1(lgamma(z)); - } - - return expm1(lgamma(1 + z)); -} - -template -BOOST_MATH_GPU_ENABLED T tgamma1pm1(T x, const Policy&) -{ - return tgamma1pm1(x); -} - -} // namespace math -} // namespace boost - -#endif // __CUDACC_RTC__ - #endif // BOOST_MATH_SF_GAMMA_HPP From c3fefdd9f58316ffc6d85f2854da34255c3836ed Mon Sep 17 00:00:00 2001 From: 
Matt Borland Date: Mon, 26 Aug 2024 16:03:16 -0400 Subject: [PATCH 09/40] Fix fraction use of STL headers --- include/boost/math/tools/fraction.hpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/include/boost/math/tools/fraction.hpp b/include/boost/math/tools/fraction.hpp index e5a31edf6..fd1fdb3bc 100644 --- a/include/boost/math/tools/fraction.hpp +++ b/include/boost/math/tools/fraction.hpp @@ -18,9 +18,6 @@ #include #include #include -#include -#include -#include namespace boost{ namespace math{ namespace tools{ @@ -67,7 +64,7 @@ namespace detail template struct fraction_traits - : public std::conditional< + : public boost::math::conditional< is_pair::value, fraction_traits_pair, fraction_traits_simple>::type @@ -183,7 +180,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type #endif ) { - boost::math::uintmax_t max_terms = (std::numeric_limits::max)(); + boost::math::uintmax_t max_terms = (boost::math::numeric_limits::max)(); return detail::continued_fraction_b_impl(g, factor, max_terms); } @@ -201,7 +198,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type using result_type = typename traits::result_type; result_type factor = ldexp(1.0f, 1 - bits); // 1 / pow(result_type(2), bits); - boost::math::uintmax_t max_terms = (std::numeric_limits::max)(); + boost::math::uintmax_t max_terms = (boost::math::numeric_limits::max)(); return detail::continued_fraction_b_impl(g, factor, max_terms); } @@ -310,7 +307,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type #endif ) { - boost::math::uintmax_t max_iter = (std::numeric_limits::max)(); + boost::math::uintmax_t max_iter = (boost::math::numeric_limits::max)(); return detail::continued_fraction_a_impl(g, factor, max_iter); } @@ -328,7 +325,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type typedef typename traits::result_type result_type; result_type factor = ldexp(1.0f, 
1-bits); // 1 / pow(result_type(2), bits); - boost::math::uintmax_t max_iter = (std::numeric_limits::max)(); + boost::math::uintmax_t max_iter = (boost::math::numeric_limits::max)(); return detail::continued_fraction_a_impl(g, factor, max_iter); } From 1deb121586ea3c36eec41887a00fc649b71b3e77 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 16:04:20 -0400 Subject: [PATCH 10/40] Mark gamma functions in fwd --- .../boost/math/special_functions/math_fwd.hpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 21f51e507..16ae3b61e 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -512,28 +512,28 @@ namespace boost // gamma inverse. template - tools::promote_args_t gamma_p_inv(T1 a, T2 p); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p_inv(T1 a, T2 p); template - tools::promote_args_t gamma_p_inva(T1 a, T2 p, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p_inva(T1 a, T2 p, const Policy&); template - tools::promote_args_t gamma_p_inva(T1 a, T2 p); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p_inva(T1 a, T2 p); template - tools::promote_args_t gamma_p_inv(T1 a, T2 p, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_p_inv(T1 a, T2 p, const Policy&); template - tools::promote_args_t gamma_q_inv(T1 a, T2 q); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_q_inv(T1 a, T2 q); template - tools::promote_args_t gamma_q_inv(T1 a, T2 q, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_q_inv(T1 a, T2 q, const Policy&); template - tools::promote_args_t gamma_q_inva(T1 a, T2 q); + BOOST_MATH_GPU_ENABLED tools::promote_args_t gamma_q_inva(T1 a, T2 q); template - tools::promote_args_t gamma_q_inva(T1 a, T2 q, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t 
gamma_q_inva(T1 a, T2 q, const Policy&); // digamma: template @@ -1447,16 +1447,16 @@ namespace boost BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_p_derivative(T1 a, T2 x){ return boost::math::gamma_p_derivative(a, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t gamma_p_inv(T1 a, T2 p){ return boost::math::gamma_p_inv(a, p, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_p_inv(T1 a, T2 p){ return boost::math::gamma_p_inv(a, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t gamma_p_inva(T1 a, T2 p){ return boost::math::gamma_p_inva(a, p, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_p_inva(T1 a, T2 p){ return boost::math::gamma_p_inva(a, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t gamma_q_inv(T1 a, T2 q){ return boost::math::gamma_q_inv(a, q, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_q_inv(T1 a, T2 q){ return boost::math::gamma_q_inv(a, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t gamma_q_inva(T1 a, T2 q){ return boost::math::gamma_q_inva(a, q, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t gamma_q_inva(T1 a, T2 q){ return boost::math::gamma_q_inva(a, q, Policy()); }\ \ template \ BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t digamma(T x){ return boost::math::digamma(x, Policy()); }\ From 5e8626f2c4ce25694d6c53fe6e4db217a4f7a9ee Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 16:07:13 -0400 Subject: [PATCH 11/40] Disable declval on all GPU platforms --- include/boost/math/tools/fraction.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/boost/math/tools/fraction.hpp b/include/boost/math/tools/fraction.hpp index fd1fdb3bc..f36d024c4 100644 --- a/include/boost/math/tools/fraction.hpp +++ 
b/include/boost/math/tools/fraction.hpp @@ -112,7 +112,7 @@ namespace detail { template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b_impl(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT // SYCL can not handle this condition so we only check float on that platform && noexcept(std::declval()()) #endif @@ -164,7 +164,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -175,7 +175,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -187,7 +187,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -205,7 +205,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits, 
boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -238,7 +238,7 @@ namespace detail { template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a_impl(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -291,7 +291,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -302,7 +302,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -314,7 +314,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) @@ -333,7 +333,7 @@ BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type template 
BOOST_MATH_GPU_ENABLED inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) - #ifndef BOOST_MATH_ENABLE_SYCL + #ifndef BOOST_MATH_HAS_GPU_SUPPORT && noexcept(std::declval()()) #endif ) From 8c118c54f6df7f8d1b90786640ce8e8a5857243d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 16:07:33 -0400 Subject: [PATCH 12/40] Disable more unneeded code on device --- include/boost/math/special_functions/gamma.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 8a47d600f..2f7bb8e4a 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -405,6 +405,8 @@ BOOST_MATH_GPU_ENABLED inline T lower_gamma_series(T a, T z, const Policy& pol, return result; } +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + // // Fully generic tgamma and lgamma use Stirling's approximation // with Bernoulli numbers. 
@@ -454,8 +456,6 @@ int minimum_argument_for_bernoulli_recursion() return min_arg; } -#ifndef BOOST_MATH_HAS_GPU_SUPPORT - template T scaled_tgamma_no_lanczos(const T& z, const Policy& pol, bool islog = false) { @@ -1940,7 +1940,7 @@ struct igamma_initializer } } BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; BOOST_MATH_STATIC const init initializer; BOOST_MATH_GPU_ENABLED static void force_instantiate() From e74868b9bd98bb9d774b253096eea1f24f0d264f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 16:22:18 -0400 Subject: [PATCH 13/40] Add forward decl for NVRTC tgamma --- .../special_functions/detail/igamma_large.hpp | 2 +- .../boost/math/special_functions/gamma.hpp | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp index 13790ca82..0b9586596 100644 --- a/include/boost/math/special_functions/detail/igamma_large.hpp +++ b/include/boost/math/special_functions/detail/igamma_large.hpp @@ -69,7 +69,7 @@ namespace boost{ namespace math{ namespace detail{ // when T is unsuitable to be passed to these routines: // template -inline T igamma_temme_large(T, T, const Policy& /* pol */, const boost::math::integral_constant&) +BOOST_MATH_GPU_ENABLED inline T igamma_temme_large(T, T, const Policy& /* pol */, const boost::math::integral_constant&) { // stub function, should never actually be called BOOST_MATH_ASSERT(0); diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 2f7bb8e4a..4b244afdf 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -207,7 +207,7 @@ BOOST_MATH_GPU_ENABLED BOOST_MATH_FORCEINLINE T gamma_imp(T z, const Policy& pol result = 
-boost::math::constants::pi() / result; if(result == 0) return policies::raise_underflow_error(function, "Result of tgamma is too small to represent.", pol); - if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL) + if((boost::math::fpclassify)(result) == (int)BOOST_MATH_FP_SUBNORMAL) return policies::raise_denorm_error(function, "Result of tgamma is denormalized.", result, pol); BOOST_MATH_INSTRUMENT_VARIABLE(result); return result; @@ -630,7 +630,7 @@ T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&) if(gamma_value == 0) return policies::raise_underflow_error(function, "Result of tgamma is too small to represent.", pol); - if((boost::math::fpclassify)(gamma_value) == static_cast(FP_SUBNORMAL)) + if((boost::math::fpclassify)(gamma_value) == static_cast(BOOST_MATH_FP_SUBNORMAL)) return policies::raise_denorm_error(function, "Result of tgamma is denormalized.", gamma_value, pol); } @@ -778,6 +778,19 @@ T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sig #endif // BOOST_MATH_HAS_GPU_SUPPORT +// In order for tgammap1m1_imp to compile we need a forward decl of boost::math::tgamma +// The rub is that we can't just use math_fwd so we provide one here only in that circumstance +#ifdef BOOST_MATH_HAS_NVRTC +template +BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma(RT z); + +template +BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma(RT1 a, RT2 z); + +template +BOOST_MATH_GPU_ENABLED tools::promote_args_t tgamma(RT1 a, RT2 z, const Policy& pol); +#endif + // // This helper calculates tgamma(dz+1)-1 without cancellation errors, // used by the upper incomplete gamma with z < 1: @@ -921,7 +934,7 @@ BOOST_MATH_GPU_ENABLED T full_igamma_prefix(T a, T z, const Policy& pol) // This error handling isn't very good: it happens after the fact // rather than before it... 
// - if((boost::math::fpclassify)(prefix) == (int)FP_INFINITE) + if((boost::math::fpclassify)(prefix) == (int)BOOST_MATH_FP_INFINITE) return policies::raise_overflow_error("boost::math::detail::full_igamma_prefix<%1%>(%1%, %1%)", "Result of incomplete gamma function is too large to represent.", pol); return prefix; From 0342f4878c547448f6f053fbdd9c34a6f36a217b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 08:48:48 -0400 Subject: [PATCH 14/40] Disable unneeded items for all GPU --- include/boost/math/special_functions/beta.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp index f0ac6318b..00b8e45bf 100644 --- a/include/boost/math/special_functions/beta.hpp +++ b/include/boost/math/special_functions/beta.hpp @@ -136,7 +136,7 @@ BOOST_MATH_GPU_ENABLED T beta_imp(T a, T b, const Lanczos&, const Policy& pol) // Generic implementation of Beta(a,b) without Lanczos approximation support // (Caution this is slow!!!): // -#ifndef BOOST_MATH_HAS_NVRTC +#ifndef BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T beta_imp(T a, T b, const lanczos::undefined_lanczos& l, const Policy& pol) { @@ -461,7 +461,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, // // This version is generic, slow, and does not use the Lanczos approximation. 
// -#ifndef BOOST_MATH_HAS_NVRTC +#ifndef BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T ibeta_power_terms(T a, T b, @@ -741,7 +741,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const Lanczos&, bool // // Incomplete Beta series again, this time without Lanczos support: // -#ifndef BOOST_MATH_HAS_NVRTC +#ifndef BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::lanczos::undefined_lanczos& l, bool normalised, T* p_derivative, T y, const Policy& pol) { @@ -958,7 +958,7 @@ struct Pn_size #endif }; -#ifndef BOOST_MATH_HAS_NVRTC +#ifndef BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised) { From 93963a4fad542c75fdd03886bf185d6a916613af Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 09:34:24 -0400 Subject: [PATCH 15/40] Change workaround for missing overloads --- include/boost/math/special_functions/gamma.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 4b244afdf..220af5574 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -814,7 +814,11 @@ BOOST_MATH_GPU_ENABLED T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& if(dz < T(-0.5)) { // Best method is simply to subtract 1 from tgamma: + #ifdef BOOST_MATH_HAS_NVRTC + result = ::tgamma(1+dz) - 1; + #else result = boost::math::tgamma(1+dz, pol) - 1; + #endif BOOST_MATH_INSTRUMENT_CODE(result); } else @@ -836,7 +840,11 @@ BOOST_MATH_GPU_ENABLED T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& else { // Best method is simply to subtract 1 from tgamma: + #ifdef BOOST_MATH_HAS_NVRTC + result = ::tgamma(1+dz) - 1; + #else result = boost::math::tgamma(1+dz, pol) - 1; + #endif BOOST_MATH_INSTRUMENT_CODE(result); } } From 
47420a5bf9858aba136a13d1a67224fa0d36b0fe Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 09:34:45 -0400 Subject: [PATCH 16/40] Rearrange definition location --- include/boost/math/special_functions/gamma.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 220af5574..f794d2e46 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -2054,13 +2054,6 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t } // namespace detail -template -BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type - tgamma(T z) -{ - return tgamma(z, policies::policy<>()); -} - template BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type lgamma(T z, int* sign, const Policy&) @@ -2149,6 +2142,12 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t using result_type = tools::promote_args_t; return static_cast(detail::tgamma(a, z, pol, boost::math::false_type())); } +template +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type + tgamma(T z) +{ + return tgamma(z, policies::policy<>()); +} // // Full lower incomplete gamma: // From 77610af8eb8e747dc299792082edefab6d6e5221 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 09:35:32 -0400 Subject: [PATCH 17/40] Add include path to cuda now that workaround is removed --- test/test_gamma_nvrtc_double.cpp | 4 ++-- test/test_gamma_nvrtc_float.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_gamma_nvrtc_double.cpp b/test/test_gamma_nvrtc_double.cpp index 9302d1190..9fe293372 100644 --- a/test/test_gamma_nvrtc_double.cpp +++ b/test/test_gamma_nvrtc_double.cpp @@ -85,9 +85,9 @@ int main() nvrtcAddNameExpression(prog, "test_gamma_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN - const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #else - const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #endif // Compile the program diff --git a/test/test_gamma_nvrtc_float.cpp b/test/test_gamma_nvrtc_float.cpp index 931b4758f..26b02cac8 100644 --- a/test/test_gamma_nvrtc_float.cpp +++ b/test/test_gamma_nvrtc_float.cpp @@ -85,9 +85,9 @@ int main() nvrtcAddNameExpression(prog, "test_gamma_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN - const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #else - const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"}; + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; #endif // Compile the program From 3c280683ae9a3a5039a55ca5570473371d987aa7 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 10:18:48 -0400 Subject: [PATCH 18/40] Fix NVRTC incompatibility with recursion and forward decls --- .../boost/math/special_functions/gamma.hpp | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index f794d2e46..e1e6923d7 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ 
b/include/boost/math/special_functions/gamma.hpp @@ -1646,7 +1646,18 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli } else { + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + return 1 / (z * ::tgammaf(z + delta)); + } + else + { + return 1 / (z * ::tgamma(z + delta)); + } + #else return 1 / (z * boost::math::tgamma(z + delta, pol)); + #endif } } T zgh = static_cast(z + T(Lanczos::g()) - constants::half()); @@ -1753,7 +1764,18 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) if((z <= 0) || (z + delta <= 0)) { // This isn't very sophisticated, or accurate, but it does work: + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + return ::tgammaf(z) / ::tgammaf(z + delta); + } + else + { + return ::tgamma(z) / ::tgamma(z + delta); + } + #else return boost::math::tgamma(z, pol) / boost::math::tgamma(z + delta, pol); + #endif } if(floor(delta) == delta) @@ -1813,17 +1835,32 @@ BOOST_MATH_GPU_ENABLED T tgamma_ratio_imp(T x, T y, const Policy& pol) if((y <= 0) || (boost::math::isinf)(y)) return policies::raise_domain_error("boost::math::tgamma_ratio<%1%>(%1%, %1%)", "Gamma function ratios only implemented for positive arguments (got b=%1%).", y, pol); + // We don't need to worry about the denorm case on device + // And this has the added bonus of removing recursion + #ifndef BOOST_MATH_HAS_GPU_SUPPORT if(x <= tools::min_value()) { // Special case for denorms...Ugh. 
T shift = ldexp(T(1), tools::digits()); return shift * tgamma_ratio_imp(T(x * shift), y, pol); } + #endif if((x < max_factorial::value) && (y < max_factorial::value)) { // Rather than subtracting values, lets just call the gamma functions directly: + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + return ::tgammaf(x) / ::tgammaf(y); + } + else + { + return ::tgamma(x) / ::tgamma(y); + } + #else return boost::math::tgamma(x, pol) / boost::math::tgamma(y, pol); + #endif } T prefix = 1; if(x < 1) @@ -1839,12 +1876,35 @@ BOOST_MATH_GPU_ENABLED T tgamma_ratio_imp(T x, T y, const Policy& pol) y -= 1; prefix /= y; } + + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + return prefix * ::tgammaf(x) / ::tgammaf(y); + } + else + { + return prefix * ::tgamma(x) / ::tgamma(y); + } + #else return prefix * boost::math::tgamma(x, pol) / boost::math::tgamma(y, pol); + #endif } // // result is almost certainly going to underflow to zero, try logs just in case: // + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + return ::expf(::lgammaf(x) - ::lgammaf(y)); + } + else + { + return ::exp(::lgamma(x) - ::lgamma(y)); + } + #else return exp(boost::math::lgamma(x, pol) - boost::math::lgamma(y, pol)); + #endif } if(y < 1) { @@ -1859,17 +1919,44 @@ BOOST_MATH_GPU_ENABLED T tgamma_ratio_imp(T x, T y, const Policy& pol) x -= 1; prefix *= x; } + + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + return prefix * ::tgammaf(x) / ::tgammaf(y); + } + else + { + return prefix * ::tgamma(x) / ::tgamma(y); + } + #else return prefix * boost::math::tgamma(x, pol) / boost::math::tgamma(y, pol); + #endif } // // Result will almost certainly overflow, try logs just in case: // + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + return ::expf(::lgammaf(x) - ::lgammaf(y)); + } + else + { + return ::exp(::lgamma(x) - ::lgamma(y)); + } + #else return exp(boost::math::lgamma(x, pol) - boost::math::lgamma(y, pol)); + #endif } // // 
Regular case, x and y both large and similar in magnitude: // + #ifdef BOOST_MATH_HAS_NVRTC + return detail::tgamma_delta_ratio_imp(x, y - x, pol); + #else return boost::math::tgamma_delta_ratio(x, y - x, pol); + #endif } template From 50cfb762b81ac66b3b7c1068b5d539e07994640a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 10:19:07 -0400 Subject: [PATCH 19/40] Add tgamma_ratio CUDA and NVRTC testing --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_tgamma_ratio_double.cu | 104 +++++++++++++ test/test_tgamma_ratio_float.cu | 104 +++++++++++++ test/test_tgamma_ratio_nvrtc_double.cpp | 190 ++++++++++++++++++++++++ test/test_tgamma_ratio_nvrtc_float.cpp | 190 ++++++++++++++++++++++++ 6 files changed, 592 insertions(+) create mode 100644 test/test_tgamma_ratio_double.cu create mode 100644 test/test_tgamma_ratio_float.cu create mode 100644 test/test_tgamma_ratio_nvrtc_double.cpp create mode 100644 test/test_tgamma_ratio_nvrtc_float.cpp diff --git a/test/cuda_jamfile b/test/cuda_jamfile index a061fe02a..7ec1f072c 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -154,6 +154,8 @@ run test_lgamma_double.cu ; run test_lgamma_float.cu ; run test_tgamma_double.cu ; run test_tgamma_float.cu ; +run test_tgamma_ratio_double.cu ; +run test_tgamma_ratio_float.cu ; run test_log1p_double.cu ; run test_log1p_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index de235822e..bce2e125f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -148,6 +148,8 @@ run test_fpclassify_nvrtc_float.cpp ; run test_gamma_nvrtc_double.cpp ; run test_gamma_nvrtc_float.cpp ; +run test_tgamma_ratio_nvrtc_double.cpp ; +run test_tgamma_ratio_nvrtc_float.cpp ; run test_log1p_nvrtc_double.cpp ; run test_log1p_nvrtc_float.cpp ; diff --git a/test/test_tgamma_ratio_double.cu b/test/test_tgamma_ratio_double.cu new file mode 100644 index 000000000..059e1c3c6 --- /dev/null +++ b/test/test_tgamma_ratio_double.cu @@ -0,0 +1,104 @@ + +// Copyright John 
Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::tgamma_ratio(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << 
std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::tgamma_ratio(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_tgamma_ratio_float.cu b/test/test_tgamma_ratio_float.cu new file mode 100644 index 000000000..dc669bd7f --- /dev/null +++ b/test/test_tgamma_ratio_float.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::tgamma_ratio(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::tgamma_ratio(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_tgamma_ratio_nvrtc_double.cpp b/test/test_tgamma_ratio_nvrtc_double.cpp new file mode 100644 index 000000000..5b0c3b1e6 --- /dev/null +++ b/test/test_tgamma_ratio_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_tgamma_ratio_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::tgamma_ratio(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_tgamma_ratio_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_tgamma_ratio_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* 
opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_tgamma_ratio_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, 
h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::tgamma_ratio(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_tgamma_ratio_nvrtc_float.cpp b/test/test_tgamma_ratio_nvrtc_float.cpp new file mode 100644 index 000000000..5b0c3b1e6 --- /dev/null +++ b/test/test_tgamma_ratio_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_tgamma_ratio_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::tgamma_ratio(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_tgamma_ratio_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_tgamma_ratio_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* 
opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_tgamma_ratio_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, 
h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::tgamma_ratio(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 28bbccfe0d8a162c97cfb5fc336f0f90cf2a374e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 10:37:28 -0400 Subject: [PATCH 20/40] Fix NVRTC handling of gamma_p_derivative --- include/boost/math/special_functions/gamma.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index e1e6923d7..2709482e4 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1991,7 +1991,18 @@ BOOST_MATH_GPU_ENABLED T gamma_p_derivative_imp(T a, T x, const Policy& pol) if(f1 == 0) { // Underflow in calculation, use logs instead: + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + f1 = a * ::logf(x) - x - ::lgammaf(a) - ::logf(x); + } + else + { + f1 = a * ::log(x) - x - ::lgamma(a) - ::log(x); + } + #else f1 = a * log(x) - x - lgamma(a, pol) - log(x); + #endif f1 = exp(f1); } else From c1ffaf49379e4508f9925b7983c8e8cd33e8edcd Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 10:37:45 -0400 Subject: [PATCH 21/40] Add gamma_p_derivative CUDA and NVRTC testing --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_gamma_p_derivative_double.cu | 104 ++++++++++ test/test_gamma_p_derivative_float.cu | 104 ++++++++++ test/test_gamma_p_derivative_nvrtc_double.cpp | 190 ++++++++++++++++++ test/test_gamma_p_derivative_nvrtc_float.cpp | 190 ++++++++++++++++++ test/test_tgamma_ratio_nvrtc_float.cpp | 4 +- 7 files changed, 594 insertions(+), 2 deletions(-) create mode 100644 test/test_gamma_p_derivative_double.cu create mode 100644 test/test_gamma_p_derivative_float.cu create mode 100644 test/test_gamma_p_derivative_nvrtc_double.cpp create mode 100644 test/test_gamma_p_derivative_nvrtc_float.cpp diff --git 
a/test/cuda_jamfile b/test/cuda_jamfile index 7ec1f072c..d75024409 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -156,6 +156,8 @@ run test_tgamma_double.cu ; run test_tgamma_float.cu ; run test_tgamma_ratio_double.cu ; run test_tgamma_ratio_float.cu ; +run test_gamma_p_derivative_double.cu ; +run test_gamma_p_derivative_float.cu ; run test_log1p_double.cu ; run test_log1p_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index bce2e125f..ab3d710cd 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -148,6 +148,8 @@ run test_fpclassify_nvrtc_float.cpp ; run test_gamma_nvrtc_double.cpp ; run test_gamma_nvrtc_float.cpp ; +run test_gamma_p_derivative_nvrtc_double.cpp ; +run test_gamma_p_derivative_nvrtc_float.cpp ; run test_tgamma_ratio_nvrtc_double.cpp ; run test_tgamma_ratio_nvrtc_float.cpp ; diff --git a/test/test_gamma_p_derivative_double.cu b/test/test_gamma_p_derivative_double.cu new file mode 100644 index 000000000..566bc1657 --- /dev/null +++ b/test/test_gamma_p_derivative_double.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Each in-range thread i computes out[i] = gamma_p_derivative(in1[i], in2[i]).
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::gamma_p_derivative(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine: runs the kernel, then recomputes on the host and compares.
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors with pseudo-random values in [0, 1]
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the gamma_p_derivative CUDA kernel, one thread per element
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::gamma_p_derivative(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results: device and host must agree to within 10 epsilon
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_gamma_p_derivative_float.cu b/test/test_gamma_p_derivative_float.cu new file mode 100644 index 000000000..f9fd52a50 --- /dev/null +++ b/test/test_gamma_p_derivative_float.cu @@ -0,0 +1,104 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Each in-range thread i computes out[i] = gamma_p_derivative(in1[i], in2[i]).
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::gamma_p_derivative(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine: runs the kernel, then recomputes on the host and compares.
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors with pseudo-random values in [0, 1]
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the gamma_p_derivative CUDA kernel, one thread per element
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::gamma_p_derivative(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results: device and host must agree to within 10 epsilon
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_gamma_p_derivative_nvrtc_double.cpp b/test/test_gamma_p_derivative_nvrtc_double.cpp new file mode 100644 index 000000000..53a752c2d --- /dev/null +++ b/test/test_gamma_p_derivative_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/gamma.hpp>
+extern "C" __global__
+void test_gamma_p_derivative_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::gamma_p_derivative(in1[i], in2[i]);
+    }
+}
+)";
+// Abort with msg and the CUDA runtime error string when result is not cudaSuccess.
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+// Abort with msg and the driver API error string when result is not CUDA_SUCCESS.
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+// Abort with msg and the NVRTC error string when result is not NVRTC_SUCCESS.
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+// Compile the kernel with NVRTC, run it on random inputs and compare against the host.
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_p_derivative_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_p_derivative_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_p_derivative_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result: every finite host value must agree to within 300 epsilon
+        int failures = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto serial = boost::math::gamma_p_derivative(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(serial) && boost::math::epsilon_difference(serial, h_out[i]) > 300)
+            {
+                ++failures;
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n Serial: " << serial
+                          << "\n Dist: " << boost::math::epsilon_difference(serial, h_out[i]) << std::endl;
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        // Exit non-zero if any finite reference value was out of tolerance.
+        return failures == 0 ? 0 : EXIT_FAILURE;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_gamma_p_derivative_nvrtc_float.cpp b/test/test_gamma_p_derivative_nvrtc_float.cpp new file mode 100644 index 000000000..da9c50855 --- /dev/null +++ b/test/test_gamma_p_derivative_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/gamma.hpp>
+extern "C" __global__
+void test_gamma_p_derivative_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::gamma_p_derivative(in1[i], in2[i]);
+    }
+}
+)";
+// Abort with msg and the CUDA runtime error string when result is not cudaSuccess.
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+// Abort with msg and the driver API error string when result is not CUDA_SUCCESS.
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+// Abort with msg and the NVRTC error string when result is not NVRTC_SUCCESS.
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+// Compile the kernel with NVRTC, run it on random inputs and compare against the host.
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_p_derivative_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_p_derivative_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_p_derivative_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result: every finite host value must agree to within 300 epsilon
+        int failures = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto serial = boost::math::gamma_p_derivative(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(serial) && boost::math::epsilon_difference(serial, h_out[i]) > 300)
+            {
+                ++failures;
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n Serial: " << serial
+                          << "\n Dist: " << boost::math::epsilon_difference(serial, h_out[i]) << std::endl;
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        // Exit non-zero if any finite reference value was out of tolerance.
+        return failures == 0 ? 0 : EXIT_FAILURE;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_tgamma_ratio_nvrtc_float.cpp b/test/test_tgamma_ratio_nvrtc_float.cpp index 5b0c3b1e6..ab1bf339b 100644 --- a/test/test_tgamma_ratio_nvrtc_float.cpp +++ b/test/test_tgamma_ratio_nvrtc_float.cpp @@ -20,10 +20,10 @@ #include #include -typedef double float_type; +typedef float float_type; const char* cuda_kernel = R"( -typedef double float_type; +typedef float float_type; #include #include extern "C" __global__ From a17a7dbc7d72a4323394efd28182978393375af3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 11:27:31 -0400 Subject: [PATCH 22/40] Remove recursion from gamma_incomplete_imp --- .../boost/math/special_functions/gamma.hpp | 151 ++++++++++-------- 1 file changed, 86 insertions(+), 65 deletions(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 2709482e4..c9f1610a8 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1237,7 +1237,7 @@ BOOST_MATH_GPU_ENABLED T incomplete_tgamma_large_x(const T& a, const T& x, const // Main incomplete gamma entry point, handles all four incomplete gamma's: // template -BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, +BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp_final(T a, T x, bool normalised, bool invert, const Policy& pol, T* p_derivative) { constexpr auto function = "boost::math::gamma_p<%1%>(%1%, %1%)"; @@ -1252,70 +1252,6 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in T result = 0; // Just to avoid warning C4701: potentially uninitialized local variable 'result' used - if(a >= max_factorial::value && !normalised) - { - // - // When we're computing the non-normalized incomplete gamma - // and a is large the result is
rather hard to compute unless - // we use logs. There are really two options - if x is a long - // way from a in value then we can reliably use methods 2 and 4 - // below in logarithmic form and go straight to the result. - // Otherwise we let the regularized gamma take the strain - // (the result is unlikely to underflow in the central region anyway) - // and combine with lgamma in the hopes that we get a finite result. - // - if(invert && (a * 4 < x)) - { - // This is method 4 below, done in logs: - result = a * log(x) - x; - if(p_derivative) - *p_derivative = exp(result); - result += log(upper_gamma_fraction(a, x, policies::get_epsilon())); - } - else if(!invert && (a > 4 * x)) - { - // This is method 2 below, done in logs: - result = a * log(x) - x; - if(p_derivative) - *p_derivative = exp(result); - T init_value = 0; - result += log(detail::lower_gamma_series(a, x, pol, init_value) / a); - } - else - { - result = gamma_incomplete_imp(a, x, true, invert, pol, p_derivative); - if(result == 0) - { - if(invert) - { - // Try http://functions.wolfram.com/06.06.06.0039.01 - result = 1 + 1 / (12 * a) + 1 / (288 * a * a); - result = log(result) - a + (a - 0.5f) * log(a) + log(boost::math::constants::root_two_pi()); - if(p_derivative) - *p_derivative = exp(a * log(x) - x); - } - else - { - // This is method 2 below, done in logs, we're really outside the - // range of this method, but since the result is almost certainly - // infinite, we should probably be OK: - result = a * log(x) - x; - if(p_derivative) - *p_derivative = exp(result); - T init_value = 0; - result += log(detail::lower_gamma_series(a, x, pol, init_value) / a); - } - } - else - { - result = log(result) + boost::math::lgamma(a, pol); - } - } - if(result > tools::log_max_value()) - return policies::raise_overflow_error(function, nullptr, pol); - return exp(result); - } - BOOST_MATH_ASSERT((p_derivative == nullptr) || normalised); bool is_int, is_half_int; @@ -1619,6 +1555,91 @@ BOOST_MATH_GPU_ENABLED T 
gamma_incomplete_imp(T a, T x, bool normalised, bool in return result; }
+// Dispatch wrapper for the incomplete gamma functions: it avoids recursion for
+// device compilers by handling the large-a, non-normalised case here (in logs)
+// and forwarding all remaining work to the non-recursive gamma_incomplete_imp_final.
+template <class T, class Policy>
+BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool invert,
+   const Policy& pol, T* p_derivative)
+{
+   constexpr auto function = "boost::math::gamma_p<%1%>(%1%, %1%)";
+   if(a <= 0)
+      return policies::raise_domain_error<T>(function, "Argument a to the incomplete gamma function must be greater than zero (got a=%1%).", a, pol);
+   if(x < 0)
+      return policies::raise_domain_error<T>(function, "Argument x to the incomplete gamma function must be >= 0 (got x=%1%).", x, pol);
+
+   BOOST_MATH_STD_USING
+
+   T result = 0; // Just to avoid warning C4701: potentially uninitialized local variable 'result' used
+
+   if(a >= max_factorial<T>::value && !normalised)
+   {
+      //
+      // When we're computing the non-normalized incomplete gamma
+      // and a is large the result is rather hard to compute unless
+      // we use logs. There are really two options - if x is a long
+      // way from a in value then we can reliably use methods 2 and 4
+      // below in logarithmic form and go straight to the result.
+      // Otherwise we let the regularized gamma take the strain
+      // (the result is unlikely to underflow in the central region anyway)
+      // and combine with lgamma in the hopes that we get a finite result.
+      //
+      if(invert && (a * 4 < x))
+      {
+         // This is method 4 below, done in logs:
+         result = a * log(x) - x;
+         if(p_derivative)
+            *p_derivative = exp(result);
+         result += log(upper_gamma_fraction(a, x, policies::get_epsilon<T, Policy>()));
+      }
+      else if(!invert && (a > 4 * x))
+      {
+         // This is method 2 below, done in logs:
+         result = a * log(x) - x;
+         if(p_derivative)
+            *p_derivative = exp(result);
+         T init_value = 0;
+         result += log(detail::lower_gamma_series(a, x, pol, init_value) / a);
+      }
+      else
+      {
+         result = gamma_incomplete_imp_final(T(a), T(x), true, invert, pol, p_derivative);
+         if(result == 0)
+         {
+            if(invert)
+            {
+               // Try http://functions.wolfram.com/06.06.06.0039.01
+               result = 1 + 1 / (12 * a) + 1 / (288 * a * a);
+               result = log(result) - a + (a - 0.5f) * log(a) + log(boost::math::constants::root_two_pi<T>());
+               if(p_derivative)
+                  *p_derivative = exp(a * log(x) - x);
+            }
+            else
+            {
+               // This is method 2 below, done in logs, we're really outside the
+               // range of this method, but since the result is almost certainly
+               // infinite, we should probably be OK:
+               result = a * log(x) - x;
+               if(p_derivative)
+                  *p_derivative = exp(result);
+               T init_value = 0;
+               result += log(detail::lower_gamma_series(a, x, pol, init_value) / a);
+            }
+         }
+         else
+         {
+            result = log(result) + boost::math::lgamma(a, pol);
+         }
+      }
+      if(result > tools::log_max_value<T>())
+         return policies::raise_overflow_error<T>(function, nullptr, pol);
+      return exp(result);
+   }
+
+   // If no special handling is required then we proceed as normal
+   return gamma_incomplete_imp_final(T(a), T(x), normalised, invert, pol, p_derivative);
+}
+
// // Ratios of two gamma functions: // From 8914b43fdb0c63e5841c37df668cb7b158ee672e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 11:27:47 -0400 Subject: [PATCH 23/40] Add SYCL testing of igamma, igamma_inv, and igamma_inva --- test/sycl_jamfile | 14 ++++++++++++++ test/test_igamma.cpp | 3 +++ test/test_igamma.hpp | 3 ++- test/test_igamma_inv.cpp | 3 +++
test/test_igamma_inv.hpp | 3 ++- test/test_igamma_inva.cpp | 3 +++ test/test_igamma_inva.hpp | 3 ++- 7 files changed, 29 insertions(+), 3 deletions(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 97c48474c..009e172d7 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -24,17 +24,31 @@ run test_saspoint5.cpp ; # Special Functions run pow_test.cpp ; + run test_beta_simple.cpp ; + run test_bessel_i.cpp ; run test_bessel_j.cpp ; run test_bessel_k.cpp ; run test_bessel_y.cpp ; + run test_cbrt.cpp ; + run test_sign.cpp ; + run test_round.cpp ; + run test_expm1_simple.cpp ; + run test_log1p_simple.cpp ; + run test_digamma_simple.cpp ; + run test_trigamma.cpp ; + run test_erf.cpp ; + run test_gamma.cpp ; +run test_igamma.cpp ; +run test_igamma_inv.cpp ; +run test_igamma_inva.cpp ; diff --git a/test/test_igamma.cpp b/test/test_igamma.cpp index 8e80c772c..693d78e8e 100644 --- a/test/test_igamma.cpp +++ b/test/test_igamma.cpp @@ -3,7 +3,10 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_igamma.hpp" // diff --git a/test/test_igamma.hpp b/test/test_igamma.hpp index b434f727e..bfe386d4d 100644 --- a/test/test_igamma.hpp +++ b/test/test_igamma.hpp @@ -8,11 +8,12 @@ #include #include +#include #define BOOST_TEST_MAIN #include #include +#include "../include_private/boost/math/tools/test.hpp" #include -#include #include #include #include diff --git a/test/test_igamma_inv.cpp b/test/test_igamma_inv.cpp index eafed0e1d..55893135c 100644 --- a/test/test_igamma_inv.cpp +++ b/test/test_igamma_inv.cpp @@ -3,7 +3,10 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_igamma_inv.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_igamma_inv.hpp b/test/test_igamma_inv.hpp index 7330e918a..cf481537e 100644 --- a/test/test_igamma_inv.hpp +++ b/test/test_igamma_inv.hpp @@ -6,13 +6,14 @@ #include #include +#include #define BOOST_TEST_MAIN #include #include #include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include diff --git a/test/test_igamma_inva.cpp b/test/test_igamma_inva.cpp index 047df1173..1ebc0ff5f 100644 --- a/test/test_igamma_inva.cpp +++ b/test/test_igamma_inva.cpp @@ -3,7 +3,10 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + #include "test_igamma_inva.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_igamma_inva.hpp b/test/test_igamma_inva.hpp index 402ea2f8b..d9d317da1 100644 --- a/test/test_igamma_inva.hpp +++ b/test/test_igamma_inva.hpp @@ -8,13 +8,14 @@ #include #include +#include #define BOOST_TEST_MAIN #include #include #include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include From 009b62ecd479209936ec4744207471f5a962b7ed Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 11:35:23 -0400 Subject: [PATCH 24/40] Ignore literal-range warnings --- test/test_bessel_i.cpp | 8 ++++++++ test/test_bessel_j.cpp | 8 ++++++++ test/test_bessel_k.cpp | 8 ++++++++ test/test_bessel_y.cpp | 8 ++++++++ test/test_holtsmark.cpp | 8 ++++++++ test/test_igamma.cpp | 8 ++++++++ test/test_igamma_inv.cpp | 8 ++++++++ test/test_igamma_inva.cpp | 8 
++++++++ test/test_landau.cpp | 8 ++++++++ test/test_round.cpp | 5 +++++ 10 files changed, 77 insertions(+) diff --git a/test/test_bessel_i.cpp b/test/test_bessel_i.cpp index 70aac91e4..09487ddf1 100644 --- a/test/test_bessel_i.cpp +++ b/test/test_bessel_i.cpp @@ -10,6 +10,14 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + #include "test_bessel_i.hpp" // diff --git a/test/test_bessel_j.cpp b/test/test_bessel_j.cpp index 516e34c29..31a64bc57 100644 --- a/test/test_bessel_j.cpp +++ b/test/test_bessel_j.cpp @@ -10,6 +10,14 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + #include "test_bessel_j.hpp" // diff --git a/test/test_bessel_k.cpp b/test/test_bessel_k.cpp index d4ab7721f..84ba0830f 100644 --- a/test/test_bessel_k.cpp +++ b/test/test_bessel_k.cpp @@ -17,6 +17,14 @@ // Constants are too big for float case, but this doesn't matter for test. 
#endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + #include "test_bessel_k.hpp" // diff --git a/test/test_bessel_y.cpp b/test/test_bessel_y.cpp index 0bbefba55..232a90396 100644 --- a/test/test_bessel_y.cpp +++ b/test/test_bessel_y.cpp @@ -10,6 +10,14 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + #include "test_bessel_y.hpp" // diff --git a/test/test_holtsmark.cpp b/test/test_holtsmark.cpp index 6aa357786..475f5400a 100644 --- a/test/test_holtsmark.cpp +++ b/test/test_holtsmark.cpp @@ -16,6 +16,14 @@ # include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + using boost::math::holtsmark_distribution; #ifndef BOOST_MATH_HAS_GPU_SUPPORT diff --git a/test/test_igamma.cpp b/test/test_igamma.cpp index 693d78e8e..0ad701996 100644 --- a/test/test_igamma.cpp +++ b/test/test_igamma.cpp @@ -7,6 +7,14 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + #include "test_igamma.hpp" // diff --git a/test/test_igamma_inv.cpp b/test/test_igamma_inv.cpp index 55893135c..de9b19d1c 100644 --- a/test/test_igamma_inv.cpp +++ b/test/test_igamma_inv.cpp @@ -7,6 +7,14 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored 
"-Wliteral-range" +#endif + #include "test_igamma_inv.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_igamma_inva.cpp b/test/test_igamma_inva.cpp index 1ebc0ff5f..8d0e96596 100644 --- a/test/test_igamma_inva.cpp +++ b/test/test_igamma_inva.cpp @@ -7,6 +7,14 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + #include "test_igamma_inva.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_landau.cpp b/test/test_landau.cpp index 297589bc0..1625b2177 100644 --- a/test/test_landau.cpp +++ b/test/test_landau.cpp @@ -15,6 +15,14 @@ # include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-range" +#endif + using boost::math::landau_distribution; #ifndef BOOST_MATH_HAS_GPU_SUPPORT diff --git a/test/test_round.cpp b/test/test_round.cpp index e363efd56..e603aa510 100644 --- a/test/test_round.cpp +++ b/test/test_round.cpp @@ -7,6 +7,11 @@ #include #endif +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wimplicit-const-int-float-conversion" +#endif + #include #define BOOST_TEST_MAIN #include From 13c3df74bdf899649d82cd4d4e87f26a27e885f2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 11:37:08 -0400 Subject: [PATCH 25/40] Remove use of static const char* for function name --- .../boost/math/special_functions/detail/igamma_inverse.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/boost/math/special_functions/detail/igamma_inverse.hpp 
b/include/boost/math/special_functions/detail/igamma_inverse.hpp index fae36dddd..4efd4f78a 100644 --- a/include/boost/math/special_functions/detail/igamma_inverse.hpp +++ b/include/boost/math/special_functions/detail/igamma_inverse.hpp @@ -402,7 +402,7 @@ BOOST_MATH_GPU_ENABLED T gamma_p_inv_imp(T a, T p, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std functions. - BOOST_MATH_STATIC const char* function = "boost::math::gamma_p_inv<%1%>(%1%, %1%)"; + constexpr auto function = "boost::math::gamma_p_inv<%1%>(%1%, %1%)"; BOOST_MATH_INSTRUMENT_VARIABLE(a); BOOST_MATH_INSTRUMENT_VARIABLE(p); @@ -477,7 +477,7 @@ BOOST_MATH_GPU_ENABLED T gamma_q_inv_imp(T a, T q, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std functions. - BOOST_MATH_STATIC const char* function = "boost::math::gamma_q_inv<%1%>(%1%, %1%)"; + constexpr auto function = "boost::math::gamma_q_inv<%1%>(%1%, %1%)"; if(a <= 0) return policies::raise_domain_error(function, "Argument a in the incomplete gamma function inverse must be >= 0 (got a=%1%).", a, pol); From e9f40eaa8e055fa3dd70128a07d20419cbdfd80b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 11:53:31 -0400 Subject: [PATCH 26/40] Fix missing CUDA header --- include/boost/math/tools/config.hpp | 2 ++ include/boost/math/tools/tuple.hpp | 1 + 2 files changed, 3 insertions(+) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 1f444c004..a2af12763 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -677,6 +677,7 @@ namespace boost{ namespace math{ #include #include #include +#include # define BOOST_MATH_CUDA_ENABLED __host__ __device__ # define BOOST_MATH_HAS_GPU_SUPPORT @@ -802,6 +803,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return #define BOOST_MATH_IF_CONSTEXPR if constexpr #define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point::value) #define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr 
+#define BOOST_MATH_NO_EXCEPTIONS // This should be defined to nothing but since it is not specifically a math macro // we need to undef before proceeding diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp index 6a949d5e5..82d23b8d7 100644 --- a/include/boost/math/tools/tuple.hpp +++ b/include/boost/math/tools/tuple.hpp @@ -13,6 +13,7 @@ #include #include +#include namespace boost { namespace math { From 930e551f55140ce4daec0f0706d52ec857fcc251 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 11:53:48 -0400 Subject: [PATCH 27/40] Remove calls under NVRTC to fwd decl --- .../boost/math/special_functions/gamma.hpp | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index c9f1610a8..90c3bd044 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1384,14 +1384,40 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp_final(T a, T x, bool normalised, b { result = finite_gamma_q(a, x, pol, p_derivative); if(!normalised) + { + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + result *= ::tgammaf(a); + } + else + { + result *= ::tgamma(a); + } + #else result *= boost::math::tgamma(a, pol); + #endif + } break; } case 1: { result = finite_half_gamma_q(a, x, p_derivative, pol); if(!normalised) + { + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + result *= ::tgammaf(a); + } + else + { + result *= ::tgamma(a); + } + #else result *= boost::math::tgamma(a, pol); + #endif + } if(p_derivative && (*p_derivative == 0)) *p_derivative = regularised_gamma_prefix(a, x, pol, lanczos_type()); break; @@ -1420,7 +1446,19 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp_final(T a, T x, bool normalised, b bool optimised_invert = false; if(invert) { + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + init_value = (normalised ? 
1 : ::tgammaf(a)); + } + else + { + init_value = (normalised ? 1 : ::tgamma(a)); + } + #else init_value = (normalised ? 1 : boost::math::tgamma(a, pol)); + #endif + if(normalised || (result >= 1) || (tools::max_value() * result > init_value)) { init_value /= result; @@ -1503,7 +1541,18 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp_final(T a, T x, bool normalised, b try { #endif + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + result = ::powf(x, a) / ::tgammaf(a + 1); + } + else + { + result = ::pow(x, a) / ::tgamma(a + 1); + } + #else result = pow(x, a) / boost::math::tgamma(a + 1, pol); + #endif #ifndef BOOST_MATH_NO_EXCEPTIONS } catch (const std::overflow_error&) @@ -1535,7 +1584,19 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp_final(T a, T x, bool normalised, b result = 1; if(invert) { + #ifdef BOOST_MATH_HAS_NVRTC + T gam; + if (boost::math::is_same_v) + { + gam = normalised ? 1 : ::tgammaf(a); + } + else + { + gam = normalised ? 1 : ::tgamma(a); + } + #else T gam = normalised ? 
1 : boost::math::tgamma(a, pol); + #endif result = gam - result; } if(p_derivative) @@ -1628,7 +1689,18 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in } else { + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + result = ::logf(result) + ::lgammaf(a); + } + else + { + result = ::log(result) + ::lgamma(a); + } + #else result = log(result) + boost::math::lgamma(a, pol); + #endif } } if(result > tools::log_max_value()) From 4cf03c5855c71f5b21ffb1fac3e6f3d3c2761bec Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 12:13:30 -0400 Subject: [PATCH 28/40] Add more nvrtc workarounds --- .../boost/math/special_functions/gamma.hpp | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 90c3bd044..186befb61 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1118,7 +1118,14 @@ BOOST_MATH_GPU_ENABLED inline T tgamma_small_upper_part(T a, T x, const Policy& // Compute the full upper fraction (Q) when a is very small: // + #ifdef BOOST_MATH_HAS_NVRTC + typedef typename tools::promote_args::type result_type; + typedef typename policies::evaluation::type value_type; + typedef typename lanczos::lanczos::type evaluation_type; + T result {detail::tgammap1m1_imp(static_cast(a), pol, evaluation_type())}; + #else T result { boost::math::tgamma1pm1(a, pol) }; + #endif if(pgam) *pgam = (result + 1) / a; @@ -1175,7 +1182,21 @@ BOOST_MATH_GPU_ENABLED T finite_half_gamma_q(T a, T x, T* p_derivative, const Po // Calculates normalised Q when a is a half-integer: // BOOST_MATH_STD_USING + + #ifdef BOOST_MATH_HAS_NVRTC + T e; + if (boost::math::is_same_v) + { + e = ::erfcf(::sqrtf(x)); + } + else + { + e = ::erfc(::sqrt(x)); + } + #else T e = boost::math::erfc(sqrt(x), pol); + #endif + if((e != 0) && (a > 1)) { T term = exp(-x) / 
sqrt(constants::pi() * x); @@ -1629,7 +1650,6 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in BOOST_MATH_STD_USING - typedef typename lanczos::lanczos::type lanczos_type; T result = 0; // Just to avoid warning C4701: potentially uninitialized local variable 'result' used From a0cfbd4971fe48b83dd306ecb085cf5f67603d05 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 12:13:43 -0400 Subject: [PATCH 29/40] Use builtin erfc instead of header cycle --- .../special_functions/detail/igamma_large.hpp | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp index 0b9586596..8e0ad1b0d 100644 --- a/include/boost/math/special_functions/detail/igamma_large.hpp +++ b/include/boost/math/special_functions/detail/igamma_large.hpp @@ -424,7 +424,18 @@ BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const b if(x < a) result = -result; + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + result += ::erfcf(::sqrtf(y)) / 2; + } + else + { + result += ::erfc(::sqrt(y)) / 2; + } + #else result += boost::math::erfc(sqrt(y), pol) / 2; + #endif return result; } @@ -477,7 +488,18 @@ BOOST_MATH_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, const b if(x < a) result = -result; + #ifdef BOOST_MATH_HAS_NVRTC + if (boost::math::is_same_v) + { + result += ::erfcf(::sqrtf(y)) / 2; + } + else + { + result += ::erfc(::sqrt(y)) / 2; + } + #else result += boost::math::erfc(sqrt(y), pol) / 2; + #endif return result; } From 1d6b97ffc9f589603c4d1ee633a4402609bb56f4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 12:13:56 -0400 Subject: [PATCH 30/40] Add CUDA and NVRTC testing of gamma_p_inv --- test/cuda_jamfile | 2 + test/nvrtc_jamfile | 2 + test/test_gamma_p_inv_double.cu | 107 ++++++++++++++ test/test_gamma_p_inv_float.cu | 107 ++++++++++++++ 
test/test_gamma_p_inv_nvrtc_double.cpp | 190 +++++++++++++++++++++++++ test/test_gamma_p_inv_nvrtc_float.cpp | 190 +++++++++++++++++++++++++ 6 files changed, 598 insertions(+) create mode 100644 test/test_gamma_p_inv_double.cu create mode 100644 test/test_gamma_p_inv_float.cu create mode 100644 test/test_gamma_p_inv_nvrtc_double.cpp create mode 100644 test/test_gamma_p_inv_nvrtc_float.cpp diff --git a/test/cuda_jamfile b/test/cuda_jamfile index d75024409..5fc9eaff3 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -158,6 +158,8 @@ run test_tgamma_ratio_double.cu ; run test_tgamma_ratio_float.cu ; run test_gamma_p_derivative_double.cu ; run test_gamma_p_derivative_float.cu ; +run test_gamma_p_inv_double.cu ; +run test_gamma_p_inv_float.cu ; run test_log1p_double.cu ; run test_log1p_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index ab3d710cd..06d288160 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -150,6 +150,8 @@ run test_gamma_nvrtc_double.cpp ; run test_gamma_nvrtc_float.cpp ; run test_gamma_p_derivative_nvrtc_double.cpp ; run test_gamma_p_derivative_nvrtc_float.cpp ; +run test_gamma_p_inv_nvrtc_double.cpp ; +run test_gamma_p_inv_nvrtc_float.cpp ; run test_tgamma_ratio_nvrtc_double.cpp ; run test_tgamma_ratio_nvrtc_float.cpp ; diff --git a/test/test_gamma_p_inv_double.cu b/test/test_gamma_p_inv_double.cu new file mode 100644 index 000000000..202ed161a --- /dev/null +++ b/test/test_gamma_p_inv_double.cu @@ -0,0 +1,107 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gamma_p_inv(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + std::mt19937_64 gen(42); + std::uniform_real_distribution dist(0, 1); + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + input_vector2[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA 
kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::gamma_p_inv(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_gamma_p_inv_float.cu b/test/test_gamma_p_inv_float.cu new file mode 100644 index 000000000..d12c22960 --- /dev/null +++ b/test/test_gamma_p_inv_float.cu @@ -0,0 +1,107 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gamma_p_inv(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + std::mt19937_64 gen(42); + std::uniform_real_distribution dist(0, 1); + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + input_vector2[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA 
kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::gamma_p_inv(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_gamma_p_inv_nvrtc_double.cpp b/test/test_gamma_p_inv_nvrtc_double.cpp new file mode 100644 index 000000000..d270dbf90 --- /dev/null +++ b/test/test_gamma_p_inv_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gamma_p_inv_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gamma_p_inv(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_p_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_p_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* 
opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_p_inv_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, 
numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gamma_p_inv(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_p_inv_nvrtc_float.cpp b/test/test_gamma_p_inv_nvrtc_float.cpp new file mode 100644 index 000000000..7c844eb68 --- /dev/null +++ b/test/test_gamma_p_inv_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gamma_p_inv_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::gamma_p_inv(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_p_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_p_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] 
= {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_p_inv_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, 
numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::gamma_p_inv(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 51f997578221d70ad9e299e0b0d999fad0beda92 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 13:35:54 -0400 Subject: [PATCH 31/40] Adjust tolerances --- test/test_gamma_p_inv_double.cu | 5 +++-- test/test_gamma_p_inv_float.cu | 2 +- test/test_igamma_inv.cpp | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/test/test_gamma_p_inv_double.cu b/test/test_gamma_p_inv_double.cu index 202ed161a..4392f37d3 100644 --- a/test/test_gamma_p_inv_double.cu +++ b/test/test_gamma_p_inv_double.cu @@ -93,9 +93,10 @@ int main(void) // check the results for(int i = 0; i < numElements; ++i) { - if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000) { - std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Result verification failed at element " << i << "!\n" + << "Error found was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; return EXIT_FAILURE; } } diff --git a/test/test_gamma_p_inv_float.cu b/test/test_gamma_p_inv_float.cu index d12c22960..70033686c 100644 --- a/test/test_gamma_p_inv_float.cu +++ b/test/test_gamma_p_inv_float.cu @@ -93,7 +93,7 @@ int main(void) // check the results for(int i = 0; i < numElements; ++i) { - if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100) { std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; return EXIT_FAILURE; diff --git a/test/test_igamma_inv.cpp b/test/test_igamma_inv.cpp index de9b19d1c..a88c3666b 100644 --- a/test/test_igamma_inv.cpp +++ b/test/test_igamma_inv.cpp @@ -100,14 +100,22 @@ void expected_results() "linux.*", // platform largest_type, // test type(s) "[^|]*medium[^|]*", // test data group + #ifdef SYCL_LANGUAGE_VERSION + "[^|]*", 350, 50); + #else "[^|]*", 350, 5); // test function + #endif add_expected_result( "[^|]*", // compiler "[^|]*", // stdlib "linux.*", // platform largest_type, // test type(s) "[^|]*large[^|]*", // test data group + #ifdef SYCL_LANGUAGE_VERSION + "[^|]*", 150, 20); // test function + #else "[^|]*", 150, 5); // test function + #endif // From e218f32bb1307736e130464f8d1509d64847fdca Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 13:59:41 -0400 Subject: [PATCH 32/40] Add GPU support to chi squared dist --- .../boost/math/distributions/chi_squared.hpp | 71 ++++++++++--------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/include/boost/math/distributions/chi_squared.hpp b/include/boost/math/distributions/chi_squared.hpp index f5daddc0a..3944569e8 100644 --- a/include/boost/math/distributions/chi_squared.hpp +++ b/include/boost/math/distributions/chi_squared.hpp @@ -9,14 +9,17 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_CHI_SQUARED_HPP #define BOOST_MATH_DISTRIBUTIONS_CHI_SQUARED_HPP +#include +#include +#include +#include +#include #include #include // for incomplete beta. 
#include // complements #include // error checks #include -#include - namespace boost{ namespace math{ template > @@ -26,20 +29,20 @@ class chi_squared_distribution using value_type = RealType; using policy_type = Policy; - explicit chi_squared_distribution(RealType i) : m_df(i) + BOOST_MATH_GPU_ENABLED explicit chi_squared_distribution(RealType i) : m_df(i) { RealType result; detail::check_df( "boost::math::chi_squared_distribution<%1%>::chi_squared_distribution", m_df, &result, Policy()); } // chi_squared_distribution - RealType degrees_of_freedom()const + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const { return m_df; } // Parameter estimation: - static RealType find_degrees_of_freedom( + BOOST_MATH_GPU_ENABLED static RealType find_degrees_of_freedom( RealType difference_from_variance, RealType alpha, RealType beta, @@ -66,16 +69,16 @@ chi_squared_distribution(RealType)->chi_squared_distribution -inline std::pair range(const chi_squared_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline boost::math::pair range(const chi_squared_distribution& /*dist*/) { // Range of permissible values for random variable x. - if (std::numeric_limits::has_infinity) + BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits::has_infinity) { - return std::pair(static_cast(0), std::numeric_limits::infinity()); // 0 to + infinity. + return boost::math::pair(static_cast(0), boost::math::numeric_limits::infinity()); // 0 to + infinity. } else { using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); // 0 to + max. + return boost::math::pair(static_cast(0), max_value()); // 0 to + max. } } @@ -84,21 +87,21 @@ inline std::pair range(const chi_squared_distribution -inline std::pair support(const chi_squared_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline boost::math::pair support(const chi_squared_distribution& /*dist*/) { // Range of supported values for random variable x. 
// This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. - return std::pair(static_cast(0), tools::max_value()); // 0 to + infinity. + return boost::math::pair(static_cast(0), tools::max_value()); // 0 to + infinity. } template -RealType pdf(const chi_squared_distribution& dist, const RealType& chi_square) +BOOST_MATH_GPU_ENABLED RealType pdf(const chi_squared_distribution& dist, const RealType& chi_square) { BOOST_MATH_STD_USING // for ADL of std functions RealType degrees_of_freedom = dist.degrees_of_freedom(); // Error check: RealType error_result; - static const char* function = "boost::math::pdf(const chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::pdf(const chi_squared_distribution<%1%>&, %1%)"; if(false == detail::check_df( function, degrees_of_freedom, &error_result, Policy())) @@ -132,12 +135,12 @@ RealType pdf(const chi_squared_distribution& dist, const RealT } // pdf template -inline RealType cdf(const chi_squared_distribution& dist, const RealType& chi_square) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const chi_squared_distribution& dist, const RealType& chi_square) { RealType degrees_of_freedom = dist.degrees_of_freedom(); // Error check: RealType error_result; - static const char* function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; if(false == detail::check_df( function, degrees_of_freedom, &error_result, Policy())) @@ -153,10 +156,10 @@ inline RealType cdf(const chi_squared_distribution& dist, cons } // cdf template -inline RealType quantile(const chi_squared_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const chi_squared_distribution& dist, const RealType& p) { RealType degrees_of_freedom = dist.degrees_of_freedom(); - static const char* function = "boost::math::quantile(const chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function 
= "boost::math::quantile(const chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == @@ -170,11 +173,11 @@ inline RealType quantile(const chi_squared_distribution& dist, } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { RealType const& degrees_of_freedom = c.dist.degrees_of_freedom(); RealType const& chi_square = c.param; - static const char* function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df( @@ -191,11 +194,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { RealType const& degrees_of_freedom = c.dist.degrees_of_freedom(); RealType const& q = c.param; - static const char* function = "boost::math::quantile(const chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == ( @@ -208,22 +211,22 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const chi_squared_distribution& dist) { // Mean of Chi-Squared distribution = v. return dist.degrees_of_freedom(); } // mean template -inline RealType variance(const chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const chi_squared_distribution& dist) { // Variance of Chi-Squared distribution = 2v. 
return 2 * dist.degrees_of_freedom(); } // variance template -inline RealType mode(const chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::mode(const chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::mode(const chi_squared_distribution<%1%>&)"; if(df < 2) return policies::raise_domain_error( @@ -234,7 +237,7 @@ inline RealType mode(const chi_squared_distribution& dist) } template -inline RealType skewness(const chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const chi_squared_distribution& dist) { BOOST_MATH_STD_USING // For ADL RealType df = dist.degrees_of_freedom(); @@ -242,14 +245,14 @@ inline RealType skewness(const chi_squared_distribution& dist) } template -inline RealType kurtosis(const chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); return 3 + 12 / df; } template -inline RealType kurtosis_excess(const chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); return 12 / df; @@ -264,12 +267,12 @@ namespace detail template struct df_estimator { - df_estimator(RealType a, RealType b, RealType variance, RealType delta) + BOOST_MATH_GPU_ENABLED df_estimator(RealType a, RealType b, RealType variance, RealType delta) : alpha(a), beta(b), ratio(delta/variance) { // Constructor } - RealType operator()(const RealType& df) + BOOST_MATH_GPU_ENABLED RealType operator()(const RealType& df) { if(df <= tools::min_value()) return 1; @@ -297,14 +300,14 @@ struct df_estimator } // namespace detail template -RealType chi_squared_distribution::find_degrees_of_freedom( +BOOST_MATH_GPU_ENABLED RealType 
chi_squared_distribution::find_degrees_of_freedom( RealType difference_from_variance, RealType alpha, RealType beta, RealType variance, RealType hint) { - static const char* function = "boost::math::chi_squared_distribution<%1%>::find_degrees_of_freedom(%1%,%1%,%1%,%1%,%1%)"; + constexpr auto function = "boost::math::chi_squared_distribution<%1%>::find_degrees_of_freedom(%1%,%1%,%1%,%1%,%1%)"; // Check for domain errors: RealType error_result; if(false == @@ -321,8 +324,8 @@ RealType chi_squared_distribution::find_degrees_of_freedom( detail::df_estimator f(alpha, beta, variance, difference_from_variance); tools::eps_tolerance tol(policies::digits()); - std::uintmax_t max_iter = policies::get_max_root_iterations(); - std::pair r = + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::pair r = tools::bracket_and_solve_root(f, hint, RealType(2), false, tol, max_iter, Policy()); RealType result = r.first + (r.second - r.first) / 2; if(max_iter >= policies::get_max_root_iterations()) From a8ecdc45eb35f688c66203e9c2c16e623908e193 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 13:59:51 -0400 Subject: [PATCH 33/40] Fix static local variable --- include/boost/math/tools/precision.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/math/tools/precision.hpp b/include/boost/math/tools/precision.hpp index 1e4bf58ac..662657732 100644 --- a/include/boost/math/tools/precision.hpp +++ b/include/boost/math/tools/precision.hpp @@ -290,7 +290,7 @@ template BOOST_MATH_GPU_ENABLED inline T root_epsilon_imp(const T*, const Tag&) { BOOST_MATH_STD_USING - static const T r_eps = sqrt(tools::epsilon()); + BOOST_MATH_STATIC_LOCAL_VARIABLE const T r_eps = sqrt(tools::epsilon()); return r_eps; } From 8119593f41ef6451943e13211bbdd6db0b144ade Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 14:00:01 -0400 Subject: [PATCH 34/40] Add chi squared dist SYCL testing --- test/sycl_jamfile | 1 + 
test/test_chi_squared.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 009e172d7..1a8a14b3b 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -13,6 +13,7 @@ project : requirements run test_arcsine.cpp ; run test_bernoulli.cpp ; run test_cauchy.cpp ; +run test_chi_squared.cpp ; run test_exponential_dist.cpp ; run test_extreme_value.cpp ; run test_holtsmark.cpp ; diff --git a/test/test_chi_squared.cpp b/test/test_chi_squared.cpp index cc7747a6c..bfd4b5f3a 100644 --- a/test/test_chi_squared.cpp +++ b/test/test_chi_squared.cpp @@ -16,9 +16,13 @@ # pragma warning(disable: 4127) // conditional expression is constant #endif -#include // for real_concept +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; +#endif #include // for chi_squared_distribution #include // for chi_squared_distribution From 90e201bf5c21ef5e940ba1492d10e9a6338836dd Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 14:13:19 -0400 Subject: [PATCH 35/40] Add chi squared dist CUDA testing --- test/cuda_jamfile | 7 ++ test/test_chi_squared_cdf_double.cu | 109 +++++++++++++++++++++++++++ test/test_chi_squared_cdf_float.cu | 109 +++++++++++++++++++++++++++ test/test_chi_squared_pdf_double.cu | 109 +++++++++++++++++++++++++++ test/test_chi_squared_pdf_float.cu | 109 +++++++++++++++++++++++++++ test/test_chi_squared_quan_double.cu | 109 +++++++++++++++++++++++++++ test/test_chi_squared_quan_float.cu | 109 +++++++++++++++++++++++++++ 7 files changed, 661 insertions(+) create mode 100644 test/test_chi_squared_cdf_double.cu create mode 100644 test/test_chi_squared_cdf_float.cu create mode 100644 test/test_chi_squared_pdf_double.cu create mode 100644 test/test_chi_squared_pdf_float.cu create mode 100644 test/test_chi_squared_quan_double.cu create mode 100644 
test/test_chi_squared_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 5fc9eaff3..420128ee2 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -35,6 +35,13 @@ run test_cauchy_quan_float.cu ; run test_cauchy_range_support_double.cu ; run test_cauchy_range_support_float.cu ; +run test_chi_squared_cdf_double.cu ; +run test_chi_squared_cdf_float.cu ; +run test_chi_squared_pdf_double.cu ; +run test_chi_squared_pdf_float.cu ; +run test_chi_squared_quan_double.cu ; +run test_chi_squared_quan_float.cu ; + run test_exponential_cdf_double.cu ; run test_exponential_cdf_float.cu ; run test_exponential_pdf_double.cu ; diff --git a/test/test_chi_squared_cdf_double.cu b/test/test_chi_squared_cdf_double.cu new file mode 100644 index 000000000..1b0c34ce6 --- /dev/null +++ b/test/test_chi_squared_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // 
Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::chi_squared_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_chi_squared_cdf_float.cu b/test/test_chi_squared_cdf_float.cu new file mode 100644 index 000000000..8ca99ed2e --- /dev/null +++ b/test/test_chi_squared_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + 
boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::chi_squared_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_chi_squared_pdf_double.cu b/test/test_chi_squared_pdf_double.cu new file mode 100644 index 000000000..ed45246d3 --- /dev/null +++ b/test/test_chi_squared_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + 
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::chi_squared_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_chi_squared_pdf_float.cu b/test/test_chi_squared_pdf_float.cu new file mode 100644 index 000000000..5a0f97db9 --- /dev/null +++ b/test/test_chi_squared_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::chi_squared_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_chi_squared_quan_double.cu b/test/test_chi_squared_quan_double.cu new file mode 100644 index 000000000..3b7dad972 --- /dev/null +++ b/test/test_chi_squared_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::chi_squared_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::chi_squared_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_chi_squared_quan_float.cu b/test/test_chi_squared_quan_float.cu new file mode 100644 index 000000000..3e779a090 --- /dev/null +++ b/test/test_chi_squared_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::chi_squared_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::chi_squared_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file From 17cd53381d0896ba623e3c367df4fdeeca30c470 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 14:20:48 -0400 Subject: [PATCH 36/40] Add chi squared dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_chi_squared_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_chi_squared_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++ test/test_chi_squared_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_chi_squared_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++ test/test_chi_squared_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_chi_squared_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_chi_squared_cdf_nvrtc_double.cpp create mode 100644 test/test_chi_squared_cdf_nvrtc_float.cpp create mode 100644 test/test_chi_squared_pdf_nvrtc_double.cpp create mode 100644 test/test_chi_squared_pdf_nvrtc_float.cpp create mode 100644 test/test_chi_squared_quan_nvrtc_double.cpp create mode 100644 test/test_chi_squared_quan_nvrtc_float.cpp 
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 06d288160..bff853548 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -31,6 +31,13 @@ run test_cauchy_pdf_nvrtc_float.cpp ; run test_cauchy_quan_nvrtc_double.cpp ; run test_cauchy_quan_nvrtc_float.cpp ; +run test_chi_squared_cdf_nvrtc_double.cpp ; +run test_chi_squared_cdf_nvrtc_float.cpp ; +run test_chi_squared_pdf_nvrtc_double.cpp ; +run test_chi_squared_pdf_nvrtc_float.cpp ; +run test_chi_squared_quan_nvrtc_double.cpp ; +run test_chi_squared_quan_nvrtc_float.cpp ; + run test_exponential_cdf_nvrtc_double.cpp ; run test_exponential_cdf_nvrtc_float.cpp ; run test_exponential_pdf_nvrtc_double.cpp ; diff --git a/test/test_chi_squared_cdf_nvrtc_double.cpp b/test/test_chi_squared_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..0ad459fa6 --- /dev/null +++ b/test/test_chi_squared_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN 
+ const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + 
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::chi_squared_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_chi_squared_cdf_nvrtc_float.cpp b/test/test_chi_squared_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..1b26c5d6f --- /dev/null +++ b/test/test_chi_squared_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + 
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + 
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::chi_squared_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_chi_squared_pdf_nvrtc_double.cpp b/test/test_chi_squared_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..18d14a4b0 --- /dev/null +++ b/test/test_chi_squared_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN 
+ const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + 
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::chi_squared_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_chi_squared_pdf_nvrtc_float.cpp b/test/test_chi_squared_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..754cbf7fb --- /dev/null +++ b/test/test_chi_squared_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::chi_squared_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + 
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + 
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::chi_squared_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_chi_squared_quan_nvrtc_double.cpp b/test/test_chi_squared_quan_nvrtc_double.cpp new file mode 100644 index 000000000..69b15b6cf --- /dev/null +++ b/test/test_chi_squared_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::chi_squared_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_chi_squared_kernel"); + + #ifdef 
BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + 
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::chi_squared_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_chi_squared_quan_nvrtc_float.cpp b/test/test_chi_squared_quan_nvrtc_float.cpp new file mode 100644 index 000000000..d6e1b2a9b --- /dev/null +++ b/test/test_chi_squared_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::chi_squared_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_chi_squared_kernel"); + + #ifdef 
BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + 
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::chi_squared_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 2199d55ac9b06c6194fddc2de3408cddd254ef44 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 14:34:05 -0400 Subject: [PATCH 37/40] Add GPU support to weibull dist --- include/boost/math/distributions/weibull.hpp | 89 ++++++++++---------- 1 file changed, 46 insertions(+), 43 deletions(-) diff --git a/include/boost/math/distributions/weibull.hpp b/include/boost/math/distributions/weibull.hpp index ca4bbd7b5..eb4de106c 100644 --- a/include/boost/math/distributions/weibull.hpp +++ b/include/boost/math/distributions/weibull.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2006. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -9,6 +10,10 @@ // http://www.itl.nist.gov/div898/handbook/eda/section3/eda3668.htm // http://mathworld.wolfram.com/WeibullDistribution.html +#include +#include +#include +#include #include #include #include @@ -16,14 +21,12 @@ #include #include -#include - namespace boost{ namespace math { namespace detail{ template -inline bool check_weibull_shape( +BOOST_MATH_GPU_ENABLED inline bool check_weibull_shape( const char* function, RealType shape, RealType* result, const Policy& pol) @@ -39,7 +42,7 @@ inline bool check_weibull_shape( } template -inline bool check_weibull_x( +BOOST_MATH_GPU_ENABLED inline bool check_weibull_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -55,7 +58,7 @@ inline bool check_weibull_x( } template -inline bool check_weibull( +BOOST_MATH_GPU_ENABLED inline bool check_weibull( const char* function, RealType scale, RealType shape, @@ -73,19 +76,19 @@ class weibull_distribution using value_type = RealType; using policy_type = Policy; - explicit 
weibull_distribution(RealType l_shape, RealType l_scale = 1) + BOOST_MATH_GPU_ENABLED explicit weibull_distribution(RealType l_shape, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; detail::check_weibull("boost::math::weibull_distribution<%1%>::weibull_distribution", l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_MATH_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_MATH_GPU_ENABLED RealType scale()const { return m_scale; } @@ -107,28 +110,28 @@ weibull_distribution(RealType,RealType)->weibull_distribution -inline std::pair range(const weibull_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline boost::math::pair range(const weibull_distribution& /*dist*/) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline std::pair support(const weibull_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline boost::math::pair support(const weibull_distribution& /*dist*/) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. using boost::math::tools::max_value; using boost::math::tools::min_value; - return std::pair(min_value(), max_value()); + return boost::math::pair(min_value(), max_value()); // A discontinuity at x == 0, so only support down to min_value. 
} template -inline RealType pdf(const weibull_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType pdf(const weibull_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::pdf(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -158,11 +161,11 @@ inline RealType pdf(const weibull_distribution& dist, const Re } template -inline RealType logpdf(const weibull_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType logpdf(const weibull_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::logpdf(const weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::logpdf(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -192,11 +195,11 @@ inline RealType logpdf(const weibull_distribution& dist, const } template -inline RealType cdf(const weibull_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const weibull_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -213,11 +216,11 @@ inline RealType cdf(const weibull_distribution& dist, const Re } template -inline RealType logcdf(const weibull_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType logcdf(const weibull_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::logcdf(const 
weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::logcdf(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -234,11 +237,11 @@ inline RealType logcdf(const weibull_distribution& dist, const } template -inline RealType quantile(const weibull_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const weibull_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -258,11 +261,11 @@ inline RealType quantile(const weibull_distribution& dist, con } template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -279,11 +282,11 @@ inline RealType cdf(const complemented2_type -inline RealType logcdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType logcdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::logcdf(const weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::logcdf(const weibull_distribution<%1%>, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -300,11 +303,11 @@ inline RealType logcdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType 
quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; + constexpr auto function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -325,11 +328,11 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const weibull_distribution<%1%>)"; + constexpr auto function = "boost::math::mean(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -343,12 +346,12 @@ inline RealType mean(const weibull_distribution& dist) } template -inline RealType variance(const weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const weibull_distribution& dist) { RealType shape = dist.shape(); RealType scale = dist.scale(); - static const char* function = "boost::math::variance(const weibull_distribution<%1%>)"; + constexpr auto function = "boost::math::variance(const weibull_distribution<%1%>)"; RealType result = 0; if(false == detail::check_weibull(function, scale, shape, &result, Policy())) @@ -363,11 +366,11 @@ inline RealType variance(const weibull_distribution& dist) } template -inline RealType mode(const weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std function pow. 
- static const char* function = "boost::math::mode(const weibull_distribution<%1%>)"; + constexpr auto function = "boost::math::mode(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -384,11 +387,11 @@ inline RealType mode(const weibull_distribution& dist) } template -inline RealType median(const weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType median(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std function pow. - static const char* function = "boost::math::median(const weibull_distribution<%1%>)"; + constexpr auto function = "boost::math::median(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); // Wikipedia k RealType scale = dist.scale(); // Wikipedia lambda @@ -404,11 +407,11 @@ inline RealType median(const weibull_distribution& dist) } template -inline RealType skewness(const weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const weibull_distribution<%1%>)"; + constexpr auto function = "boost::math::skewness(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -429,11 +432,11 @@ inline RealType skewness(const weibull_distribution& dist) } template -inline RealType kurtosis_excess(const weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const weibull_distribution<%1%>)"; + constexpr auto function = "boost::math::kurtosis_excess(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -457,15 +460,15 @@ inline RealType kurtosis_excess(const weibull_distribution& di } template -inline RealType kurtosis(const 
weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const weibull_distribution& dist) { return kurtosis_excess(dist) + 3; } template -inline RealType entropy(const weibull_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType entropy(const weibull_distribution& dist) { - using std::log; + BOOST_MATH_STD_USING RealType k = dist.shape(); RealType lambda = dist.scale(); return constants::euler()*(1-1/k) + log(lambda/k) + 1; From 8ff4b3add996d5e75c3f938f6bfc0997faefb00f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 14:34:16 -0400 Subject: [PATCH 38/40] Add weibull dist SYCL testing --- test/sycl_jamfile | 1 + test/test_weibull.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 1a8a14b3b..2fd5954ae 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -22,6 +22,7 @@ run test_laplace.cpp ; run test_logistic_dist.cpp ; run test_mapairy.cpp ; run test_saspoint5.cpp ; +run test_weibull.cpp ; # Special Functions run pow_test.cpp ; diff --git a/test/test_weibull.cpp b/test/test_weibull.cpp index 4b31a7f0b..dc509b742 100644 --- a/test/test_weibull.cpp +++ b/test/test_weibull.cpp @@ -12,15 +12,17 @@ # pragma warning (disable : 4127) // conditional expression is constant. 
#endif - +#include +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept +#endif #define BOOST_TEST_MAIN #include // Boost.Test #include #include using boost::math::weibull_distribution; -#include +#include "../include_private/boost/math/tools/test.hpp" #include "test_out_of_range.hpp" #include From 04a04496f1b516791ed77a5789b96bf78937b2f3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 14:40:29 -0400 Subject: [PATCH 39/40] Add weibull dist CUDA testing --- test/cuda_jamfile | 7 ++ test/test_weibull_cdf_double.cu | 109 +++++++++++++++++++++++++++++++ test/test_weibull_cdf_float.cu | 109 +++++++++++++++++++++++++++++++ test/test_weibull_pdf_double.cu | 109 +++++++++++++++++++++++++++++++ test/test_weibull_pdf_float.cu | 109 +++++++++++++++++++++++++++++++ test/test_weibull_quan_double.cu | 109 +++++++++++++++++++++++++++++++ test/test_weibull_quan_float.cu | 109 +++++++++++++++++++++++++++++++ 7 files changed, 661 insertions(+) create mode 100644 test/test_weibull_cdf_double.cu create mode 100644 test/test_weibull_cdf_float.cu create mode 100644 test/test_weibull_pdf_double.cu create mode 100644 test/test_weibull_pdf_float.cu create mode 100644 test/test_weibull_quan_double.cu create mode 100644 test/test_weibull_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 420128ee2..aa286b392 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -98,6 +98,13 @@ run test_saspoint5_pdf_float.cu ; run test_saspoint5_quan_double.cu ; run test_saspoint5_quan_float.cu ; +run test_weibull_cdf_double.cu ; +run test_weibull_cdf_float.cu ; +run test_weibull_pdf_double.cu ; +run test_weibull_pdf_float.cu ; +run test_weibull_quan_double.cu ; +run test_weibull_quan_float.cu ; + # Special Functions run test_beta_double.cu ; run test_beta_float.cu ; diff --git a/test/test_weibull_cdf_double.cu b/test/test_weibull_cdf_double.cu new file mode 100644 index 000000000..65efbe252 --- /dev/null +++ b/test/test_weibull_cdf_double.cu @@ 
-0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::weibull_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = 
cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::weibull_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_weibull_cdf_float.cu b/test/test_weibull_cdf_float.cu new file mode 100644 index 000000000..65c3ce1ff --- /dev/null +++ b/test/test_weibull_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::weibull_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::weibull_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_weibull_pdf_double.cu b/test/test_weibull_pdf_double.cu new file mode 100644 index 000000000..645df4c0a --- /dev/null +++ b/test/test_weibull_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::weibull_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::weibull_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_weibull_pdf_float.cu b/test/test_weibull_pdf_float.cu new file mode 100644 index 000000000..f1e6917f0 --- /dev/null +++ b/test/test_weibull_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::weibull_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::weibull_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_weibull_quan_double.cu b/test/test_weibull_quan_double.cu new file mode 100644 index 000000000..2f0500602 --- /dev/null +++ b/test/test_weibull_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::weibull_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::weibull_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_weibull_quan_float.cu b/test/test_weibull_quan_float.cu new file mode 100644 index 000000000..3027e14dd --- /dev/null +++ b/test/test_weibull_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::weibull_distribution(1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::weibull_distribution(1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file From c5e7637d8f2c9f16dd62e8e9c73855ee24c98259 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 27 Aug 2024 14:45:31 -0400 Subject: [PATCH 40/40] Add weibull dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_weibull_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_weibull_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ test/test_weibull_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_weibull_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ test/test_weibull_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_weibull_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_weibull_cdf_nvrtc_double.cpp create mode 100644 test/test_weibull_cdf_nvrtc_float.cpp create mode 100644 test/test_weibull_pdf_nvrtc_double.cpp create mode 100644 test/test_weibull_pdf_nvrtc_float.cpp create mode 100644 test/test_weibull_quan_nvrtc_double.cpp create mode 100644 test/test_weibull_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile 
b/test/nvrtc_jamfile index bff853548..4a37960a5 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -94,6 +94,13 @@ run test_saspoint5_pdf_nvrtc_float.cpp ; run test_saspoint5_quan_nvrtc_double.cpp ; run test_saspoint5_quan_nvrtc_float.cpp ; +run test_weibull_cdf_nvrtc_double.cpp ; +run test_weibull_cdf_nvrtc_float.cpp ; +run test_weibull_pdf_nvrtc_double.cpp ; +run test_weibull_pdf_nvrtc_float.cpp ; +run test_weibull_quan_nvrtc_double.cpp ; +run test_weibull_quan_nvrtc_float.cpp ; + # Special Functions run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; diff --git a/test/test_weibull_cdf_nvrtc_double.cpp b/test/test_weibull_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..60d5ff5af --- /dev/null +++ b/test/test_weibull_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_weibull_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::weibull_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* 
errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_weibull_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_weibull_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + 
checkCUError(cuModuleGetFunction(&kernel, module, "test_weibull_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::weibull_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << 
boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_weibull_cdf_nvrtc_float.cpp b/test/test_weibull_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..5085b2f7d --- /dev/null +++ b/test/test_weibull_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_weibull_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::weibull_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + 
exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_weibull_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_weibull_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_weibull_kernel"), "Failed to get kernel function"); + + int numElements = 
5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::weibull_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + 
cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_weibull_pdf_nvrtc_double.cpp b/test/test_weibull_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..2e5e237b2 --- /dev/null +++ b/test/test_weibull_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_weibull_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::weibull_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + 
{ + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_weibull_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_weibull_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_weibull_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = 
new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::weibull_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + 
cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_weibull_pdf_nvrtc_float.cpp b/test/test_weibull_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..6c3c5202c --- /dev/null +++ b/test/test_weibull_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_weibull_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::weibull_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + 
{ + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_weibull_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_weibull_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_weibull_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input 
arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::weibull_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_weibull_quan_nvrtc_double.cpp b/test/test_weibull_quan_nvrtc_double.cpp new file mode 100644 index 000000000..aed31865e --- /dev/null +++ b/test/test_weibull_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_weibull_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::weibull_distribution(1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + 
checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_weibull_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_weibull_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_weibull_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + 
std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::weibull_distribution(1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +}
diff --git a/test/test_weibull_quan_nvrtc_float.cpp b/test/test_weibull_quan_nvrtc_float.cpp
new file mode 100644
index 000000000..98997b354
--- /dev/null
+++ b/test/test_weibull_quan_nvrtc_float.cpp
@@ -0,0 +1,224 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// NVRTC test for the Weibull quantile in single precision: the kernel below is
+// compiled at run time, launched on device 0, and its output is compared with
+// the same quantile evaluated on the host.
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// NOTE(review): the header names of the #include lines in this file (and in the
+// kernel string) were missing from the copy of the patch under review; they are
+// reconstructed from the symbols the code uses. Confirm against the upstream file.
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/distributions/weibull.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+
+typedef float float_type;
+
+// Device source handed to NVRTC. The kernel reads in1 only; its second pointer
+// parameter is unnamed and unused.
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/weibull.hpp>
+extern "C" __global__
+void test_weibull_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::weibull_distribution<float_type>(1), in1[i]);
+    }
+}
+)";
+
+// Prints msg plus the runtime's error text and exits if a CUDA runtime call failed.
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Prints msg plus the driver's error text and exits if a CUDA driver call failed.
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        // cuGetErrorString sets the pointer to null for codes it does not
+        // recognise; never stream a null char*.
+        const char* errorStr = nullptr;
+        if (cuGetErrorString(result, &errorStr) != CUDA_SUCCESS || errorStr == nullptr)
+        {
+            errorStr = "unrecognized CUDA driver error";
+        }
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Prints msg plus NVRTC's error text and exits if an NVRTC call failed.
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Returns 0 when the kernel ran and every finite host result is within 300 epsilon
+// of the device result; EXIT_FAILURE on any mismatch or exception. Setup errors
+// exit immediately through the check* helpers.
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_weibull_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        checkNVRTCError(nvrtcAddNameExpression(prog, "test_weibull_kernel"), "Failed to add name expression");
+
+        // Include paths are hard-coded for the CI runner and for one developer machine.
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            // Zero-initialised size plus one spare NUL byte keep the buffer a valid
+            // C string even if either log query fails.
+            size_t log_size = 0;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            std::vector<char> log(log_size + 1, '\0');
+            nvrtcGetProgramLog(prog, log.data());
+            std::cerr << "Compilation failed:\n" << log.data() << std::endl;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size = 0;
+        checkNVRTCError(nvrtcGetPTXSize(prog, &ptx_size), "Failed to get PTX size");
+        std::vector<char> ptx(ptx_size);
+        checkNVRTCError(nvrtcGetPTX(prog, ptx.data()), "Failed to get PTX");
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_weibull_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        const size_t numBytes = static_cast<size_t>(numElements) * sizeof(float_type);
+
+        // Host buffers own their storage, so they are released on every exit path
+        // that unwinds (including the catch below).
+        std::vector<float_type> h_in1(numElements);
+        std::vector<float_type> h_in2(numElements);
+        std::vector<float_type> h_out(numElements);
+        float_type* d_in1 = nullptr;
+        float_type* d_in2 = nullptr;
+        float_type* d_out = nullptr;
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numBytes), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numBytes), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numBytes), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1.data(), numBytes, cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2.data(), numBytes, cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out.data(), d_out, numBytes, cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        int mismatches = 0;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto expected = quantile(boost::math::weibull_distribution<float_type>(1), h_in1[i]);
+
+            if (boost::math::isfinite(expected))
+            {
+                const auto eps_distance = boost::math::epsilon_difference(expected, h_out[i]);
+                if (eps_distance > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n Serial: " << expected
+                              << "\n Dist: " << eps_distance << std::endl;
+                    ++mismatches;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+
+        nvrtcDestroyProgram(&prog);
+
+        cuCtxDestroy(context);
+
+        // A test that only prints mismatches can never fail; report them through
+        // the exit status as well.
+        if (mismatches != 0)
+        {
+            std::cerr << mismatches << " of " << numElements << " results exceeded the 300 epsilon tolerance" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}