From b1d4f142dddbc12222fa965225e95ee787708ac5 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 15:46:17 -0400
Subject: [PATCH 01/31] Add overview

---
 doc/math.qbk         |  1 +
 doc/overview/gpu.qbk | 59 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 doc/overview/gpu.qbk

diff --git a/doc/math.qbk b/doc/math.qbk
index 30bc53339..1ea17752d 100644
--- a/doc/math.qbk
+++ b/doc/math.qbk
@@ -557,6 +557,7 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22.
 [include overview/standalone.qbk]
 [include overview/result_type_calc.qbk]
 [include overview/error_handling.qbk]
+[include overview/gpu.qbk]
 
 [section:compilers_overview Compilers]
 [compilers_overview]
diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
new file mode 100644
index 000000000..70f0164e0
--- /dev/null
+++ b/doc/overview/gpu.qbk
@@ -0,0 +1,59 @@
+[section:gpu Support for GPU programming in Boost.Math]
+
+[h4 GPU Support]
+
+Selected functions, distributions, tools, etc. support running on both host and device.
+These functions will have the annotation `BOOST_MATH_GPU_ENABLED` next to their individual documentation.
+We test using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
+
+[h4 How to build with device support]
+
+When compiling with CUDA or SYCL, you will have to ensure that your code runs inside a kernel function.
+It is not enough to simply compile existing code with the NVCC compiler to run it on the device.
+A simple CUDA kernel to run the Beta Distribution CDF on NVCC would be:
+
+    __global__ void cuda_beta_dist(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+        if (i < num_elements)
+        {
+            out[i] = cdf(boost::math::beta_distribution<double>(), in[i]);
+        }
+    }
+
+The same kernel on NVRTC:
+
+    const char* cuda_kernel = R"(
+    #include <boost/math/distributions/beta.hpp>
+    extern "C" __global__
+    void test_beta_dist_kernel(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            out[i] = boost::math::cdf(boost::math::beta_distribution<double>(), in[i]);
+        }
+    }
+    )";
+
+And lastly on SYCL:
+
+    void sycl_beta_dist(const double* in, double* out, int num_elements, sycl::queue& q)
+    {
+        q.submit([&](sycl::handler& h) {
+            h.parallel_for(sycl::range<1>(num_elements), [=](sycl::id<1> i) {
+                out[i] = boost::math::cdf(boost::math::beta_distribution<double>(), in[i]);
+            });
+        });
+    }
+
+Once your kernel function has been written, use the framework's usual mechanism for launching the kernel.
+
+[/
+  Copyright 2024. Matt Borland
+  Distributed under the Boost Software License, Version 1.0.
+  (See accompanying file LICENSE_1_0.txt or copy at
+  http://www.boost.org/LICENSE_1_0.txt).
+]
+

From cb99a775e736a06ffe538fdd63f39223309a6486 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 15:46:33 -0400
Subject: [PATCH 02/31] Annotate GPU markers in constants doc with type caveat

---
 doc/constants/constants.qbk | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/constants/constants.qbk b/doc/constants/constants.qbk
index 24092adf5..9cce152da 100644
--- a/doc/constants/constants.qbk
+++ b/doc/constants/constants.qbk
@@ -227,6 +227,11 @@ either construct from a decimal digit string or calculate on the fly depending u
 [[Any other value ['N]][Sets the compile time precision to ['N] bits.]]
 ]
 
+[h5 GPU Support]
+
+All Boost.Math constants are marked with `BOOST_MATH_GPU_ENABLED` and can be used on both host and device.
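+For example, a minimal CUDA kernel scaling an array by pi might look like the following sketch
+(the kernel name and launch configuration are illustrative):
+
+    __global__ void scale_by_pi(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            out[i] = in[i] * boost::math::constants::pi<double>();
+        }
+    }
+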
+Note that when running on device you are limited to using only `float` and `double` types. + [h5 Custom Specializing a constant] In addition, for user-defined types that need special handling, it's possible to partially-specialize From 603ffd2adda30e1fefb7761c3236fdebe32be1e3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 30 Aug 2024 15:46:47 -0400 Subject: [PATCH 03/31] Add GPU markers to supported dists docs --- doc/distributions/arcsine.qbk | 12 +++++++----- doc/distributions/bernoulli.qbk | 10 ++++++---- doc/distributions/beta.qbk | 22 ++++++++++++---------- doc/distributions/cauchy.qbk | 14 ++++++++------ doc/distributions/chi_squared.qbk | 8 +++++--- doc/distributions/exponential.qbk | 10 ++++++---- doc/distributions/extreme_value.qbk | 14 ++++++++------ doc/distributions/holtsmark.qbk | 14 ++++++++------ doc/distributions/landau.qbk | 18 ++++++++++-------- doc/distributions/laplace.qbk | 14 ++++++++------ doc/distributions/logistic.qbk | 14 ++++++++------ doc/distributions/mapairy.qbk | 14 ++++++++------ doc/distributions/saspoint5.qbk | 14 ++++++++------ doc/distributions/weibull.qbk | 14 ++++++++------ 14 files changed, 110 insertions(+), 82 deletions(-) diff --git a/doc/distributions/arcsine.qbk b/doc/distributions/arcsine.qbk index fbd6e86b1..7930f97d5 100644 --- a/doc/distributions/arcsine.qbk +++ b/doc/distributions/arcsine.qbk @@ -21,11 +21,11 @@ typedef Policy policy_type; // Constructor from two range parameters, x_min and x_max: - arcsine_distribution(RealType x_min = 0, RealType x_max = 1); + BOOST_MATH_GPU_ENABLED arcsine_distribution(RealType x_min = 0, RealType x_max = 1); // Range Parameter accessors: - RealType x_min() const; - RealType x_max() const; + BOOST_MATH_GPU_ENABLED RealType x_min() const; + BOOST_MATH_GPU_ENABLED RealType x_max() const; }; }} // namespaces @@ -103,8 +103,8 @@ constructs a 'Standard 01' arcsine distribution. [h5 Parameter Accessors] - RealType x_min() const; - RealType x_max() const; + BOOST_MATH_GPU_ENABLED RealType x_min() const; + BOOST_MATH_GPU_ENABLED RealType x_max() const; Return the parameter ['x_min] or ['x_max] from which this distribution was constructed. @@ -116,6 +116,8 @@ So, for example: All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The formulae for calculating these are shown in the table below, and at [@http://mathworld.wolfram.com/arcsineDistribution.html Wolfram Mathworld]. diff --git a/doc/distributions/bernoulli.qbk b/doc/distributions/bernoulli.qbk index 4a2fc7b61..719c42cd9 100644 --- a/doc/distributions/bernoulli.qbk +++ b/doc/distributions/bernoulli.qbk @@ -16,9 +16,9 @@ typedef RealType value_type; typedef Policy policy_type; - bernoulli_distribution(RealType p); // Constructor. + BOOST_MATH_GPU_ENABLED bernoulli_distribution(RealType p); // Constructor. // Accessor function. - RealType success_fraction() const + BOOST_MATH_GPU_ENABLED RealType success_fraction() const // Probability of success (as a fraction). }; }} // namespaces @@ -51,12 +51,12 @@ and the [@http://en.wikipedia.org/wiki/Cumulative_Distribution_Function Cumulati [h4 Member Functions] - bernoulli_distribution(RealType p); + BOOST_MATH_GPU_ENABLED bernoulli_distribution(RealType p); Constructs a [@http://en.wikipedia.org/wiki/bernoulli_distribution bernoulli distribution] with success_fraction /p/. 
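+For example, a fair coin can be modelled as follows (a minimal sketch; the variable names are illustrative):
+
+    boost::math::bernoulli_distribution<double> coin(0.5);
+    double p_heads = boost::math::pdf(coin, 1); // 0.5
+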
- RealType success_fraction() const + BOOST_MATH_GPU_ENABLED RealType success_fraction() const Returns the /success_fraction/ parameter of this distribution. @@ -64,6 +64,8 @@ Returns the /success_fraction/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is 0 and 1, and the useful supported range is only 0 or 1. diff --git a/doc/distributions/beta.qbk b/doc/distributions/beta.qbk index 95943f715..5ba1a6d1c 100644 --- a/doc/distributions/beta.qbk +++ b/doc/distributions/beta.qbk @@ -19,30 +19,30 @@ typedef RealType value_type; typedef Policy policy_type; // Constructor from two shape parameters, alpha & beta: - beta_distribution(RealType a, RealType b); + BOOST_MATH_GPU_ENABLED beta_distribution(RealType a, RealType b); // Parameter accessors: - RealType alpha() const; - RealType beta() const; + BOOST_MATH_GPU_ENABLED RealType alpha() const; + BOOST_MATH_GPU_ENABLED RealType beta() const; // Parameter estimators of alpha or beta from mean and variance. - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType mean, // Expected value of mean. RealType variance); // Expected value of variance. - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( RealType mean, // Expected value of mean. RealType variance); // Expected value of variance. // Parameter estimators from // either alpha or beta, and x and probability. - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType beta, // from beta. RealType x, // x. RealType probability); // cdf - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( RealType alpha, // alpha. RealType x, // probability x. RealType probability); // probability cdf. @@ -98,7 +98,7 @@ whose apex is away from the centre (where x = half). [h5 Constructor] - beta_distribution(RealType alpha, RealType beta); + BOOST_MATH_GPU_ENABLED beta_distribution(RealType alpha, RealType beta); Constructs a beta distribution with shape parameters /alpha/ and /beta/. @@ -117,11 +117,11 @@ in the graph above). [h5 Parameter Accessors] - RealType alpha() const; + BOOST_MATH_GPU_ENABLED RealType alpha() const; Returns the parameter /alpha/ from which this distribution was constructed. - RealType beta() const; + BOOST_MATH_GPU_ENABLED RealType beta() const; Returns the parameter /beta/ from which this distribution was constructed. @@ -182,6 +182,8 @@ Returns the value of [beta] that gives: All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The formulae for calculating these are shown in the table below, and at [@http://mathworld.wolfram.com/BetaDistribution.html Wolfram Mathworld]. 
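+For example, the estimators can recover the shape parameters from sample moments
+(a minimal sketch; the numeric values are illustrative):
+
+    double mean = 0.2, variance = 0.01;
+    double a = boost::math::beta_distribution<double>::find_alpha(mean, variance);
+    double b = boost::math::beta_distribution<double>::find_beta(mean, variance);
+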
diff --git a/doc/distributions/cauchy.qbk b/doc/distributions/cauchy.qbk index 6ae090818..4a177d294 100644 --- a/doc/distributions/cauchy.qbk +++ b/doc/distributions/cauchy.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - cauchy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED cauchy_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; The [@http://en.wikipedia.org/wiki/Cauchy_distribution Cauchy-Lorentz distribution] @@ -53,7 +53,7 @@ the distribution: [h4 Member Functions] - cauchy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED cauchy_distribution(RealType location = 0, RealType scale = 1); Constructs a Cauchy distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -62,11 +62,11 @@ then the result is a Standard Cauchy Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -74,6 +74,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the Cauchy distribution does not have a mean, standard deviation, etc. See __math_undefined diff --git a/doc/distributions/chi_squared.qbk b/doc/distributions/chi_squared.qbk index 753e1f401..b52d4d392 100644 --- a/doc/distributions/chi_squared.qbk +++ b/doc/distributions/chi_squared.qbk @@ -18,13 +18,13 @@ typedef Policy policy_type; // Constructor: - chi_squared_distribution(RealType i); + BOOST_MATH_GPU_ENABLED chi_squared_distribution(RealType i); // Accessor to parameter: - RealType degrees_of_freedom()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const; // Parameter estimation: - static RealType find_degrees_of_freedom( + BOOST_MATH_GPU_ENABLED static RealType find_degrees_of_freedom( RealType difference_from_mean, RealType alpha, RealType beta, @@ -104,6 +104,8 @@ See also section on Sample sizes required in All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. 
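+For example, `find_degrees_of_freedom` can be called as follows (a minimal sketch; the numeric values are illustrative):
+
+    double df = boost::math::chi_squared_distribution<double>::find_degrees_of_freedom(
+        2,     // difference_from_mean to be detected
+        0.05,  // alpha, the maximum acceptable risk of a false positive
+        0.05,  // beta, the maximum acceptable risk of a false negative
+        4);    // variance
+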
(We have followed the usual restriction of the mode to degrees of freedom >= 2, but note that the maximum of the pdf is actually zero for degrees of freedom from 2 down to 0, diff --git a/doc/distributions/exponential.qbk b/doc/distributions/exponential.qbk index 043818b4a..5214df7d4 100644 --- a/doc/distributions/exponential.qbk +++ b/doc/distributions/exponential.qbk @@ -15,9 +15,9 @@ typedef RealType value_type; typedef Policy policy_type; - exponential_distribution(RealType lambda = 1); + BOOST_MATH_GPU_ENABLED exponential_distribution(RealType lambda = 1); - RealType lambda()const; + BOOST_MATH_GPU_ENABLED RealType lambda()const; }; @@ -37,7 +37,7 @@ values of the rate parameter lambda: [h4 Member Functions] - exponential_distribution(RealType lambda = 1); + BOOST_MATH_GPU_ENABLED exponential_distribution(RealType lambda = 1); Constructs an [@http://en.wikipedia.org/wiki/Exponential_distribution Exponential distribution] @@ -46,7 +46,7 @@ Lambda is defined as the reciprocal of the scale parameter. Requires lambda > 0, otherwise calls __domain_error. - RealType lambda()const; + BOOST_MATH_GPU_ENABLED RealType lambda()const; Accessor function returns the lambda parameter of the distribution. @@ -54,6 +54,8 @@ Accessor function returns the lambda parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0, +[infin]\]. diff --git a/doc/distributions/extreme_value.qbk b/doc/distributions/extreme_value.qbk index 314917ebc..bc4e27039 100644 --- a/doc/distributions/extreme_value.qbk +++ b/doc/distributions/extreme_value.qbk @@ -14,10 +14,10 @@ public: typedef RealType value_type; - extreme_value_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED extreme_value_distribution(RealType location = 0, RealType scale = 1); - RealType scale()const; - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; }; There are various @@ -59,18 +59,18 @@ And this graph illustrates how the PDF varies with the shape parameter: [h4 Member Functions] - extreme_value_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED extreme_value_distribution(RealType location = 0, RealType scale = 1); Constructs an Extreme Value distribution with the specified location and scale parameters. Requires `scale > 0`, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -78,6 +78,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random parameter is \[-[infin], +[infin]\]. 
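+For example, quantiles of the distribution can be computed in a CUDA kernel (a minimal sketch; the kernel name is illustrative):
+
+    __global__ void ev_quantile(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            // in[i] is expected to be a probability in (0, 1)
+            out[i] = quantile(boost::math::extreme_value_distribution<double>(), in[i]);
+        }
+    }
+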
diff --git a/doc/distributions/holtsmark.qbk b/doc/distributions/holtsmark.qbk index 49149ab92..39c42ff13 100644 --- a/doc/distributions/holtsmark.qbk +++ b/doc/distributions/holtsmark.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - holtsmark_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED holtsmark_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; The [@http://en.wikipedia.org/wiki/holtsmark_distribution Holtsmark distribution] @@ -51,7 +51,7 @@ the distribution: [h4 Member Functions] - holtsmark_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED holtsmark_distribution(RealType location = 0, RealType scale = 1); Constructs a holtsmark distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -60,11 +60,11 @@ then the result is a Standard holtsmark Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -72,6 +72,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the holtsmark distribution does not have a skewness, kurtosis, etc. See __math_undefined diff --git a/doc/distributions/landau.qbk b/doc/distributions/landau.qbk index b73450504..90dced0aa 100644 --- a/doc/distributions/landau.qbk +++ b/doc/distributions/landau.qbk @@ -15,11 +15,11 @@ typedef RealType value_type; typedef Policy policy_type; - landau_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED landau_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; - RealType bias()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType bias()const; }; The [@http://en.wikipedia.org/wiki/landau_distribution Landau distribution] @@ -54,7 +54,7 @@ the distribution: [h4 Member Functions] - landau_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED landau_distribution(RealType location = 0, RealType scale = 1); Constructs a landau distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -63,15 +63,15 @@ then the result is a Standard landau Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. - RealType bias()const; + BOOST_MATH_GPU_ENABLED RealType bias()const; Returns the amount of translation by the scale parameter. [expression bias = - 2 / [pi] log(c)] @@ -80,6 +80,8 @@ Returns the amount of translation by the scale parameter. 
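+For example (a minimal sketch):
+
+    boost::math::landau_distribution<double> d; // location 0, scale 1
+    double b = d.bias();                        // -2/pi * log(1) == 0 for unit scale
+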
All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the landau distribution does not have a mean, standard deviation, etc. See __math_undefined diff --git a/doc/distributions/laplace.qbk b/doc/distributions/laplace.qbk index 93327e022..861c513f4 100644 --- a/doc/distributions/laplace.qbk +++ b/doc/distributions/laplace.qbk @@ -17,10 +17,10 @@ typedef RealType value_type; typedef Policy policy_type; // Construct: - laplace_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED laplace_distribution(RealType location = 0, RealType scale = 1); // Accessors: - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; }} // namespaces @@ -49,7 +49,7 @@ Note that the domain of the random variable remains [h4 Member Functions] - laplace_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED laplace_distribution(RealType location = 0, RealType scale = 1); Constructs a laplace distribution with location /location/ and scale /scale/. @@ -61,11 +61,11 @@ The scale parameter is proportional to the standard deviation of the random vari Requires that the scale parameter is greater than zero, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the /location/ parameter of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the /scale/ parameter of this distribution. @@ -73,6 +73,8 @@ Returns the /scale/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[-[infin],+[infin]\]. diff --git a/doc/distributions/logistic.qbk b/doc/distributions/logistic.qbk index 0a22b48d4..68557eb01 100644 --- a/doc/distributions/logistic.qbk +++ b/doc/distributions/logistic.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; // Construct: - logistic_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED logistic_distribution(RealType location = 0, RealType scale = 1); // Accessors: - RealType location()const; // location. - RealType scale()const; // scale. + BOOST_MATH_GPU_ENABLED RealType location()const; // location. + BOOST_MATH_GPU_ENABLED RealType scale()const; // scale. }; @@ -39,17 +39,17 @@ parameters change: [h4 Member Functions] - logistic_distribution(RealType u = 0, RealType s = 1); + BOOST_MATH_GPU_ENABLED logistic_distribution(RealType u = 0, RealType s = 1); Constructs a logistic distribution with location /u/ and scale /s/. Requires `scale > 0`, otherwise a __domain_error is raised. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale of this distribution. @@ -57,6 +57,8 @@ Returns the scale of this distribution. 
All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[-\[max_value\], +\[min_value\]\]. However, the pdf and cdf support inputs of +[infin] and -[infin] diff --git a/doc/distributions/mapairy.qbk b/doc/distributions/mapairy.qbk index 97d624a93..817fb980d 100644 --- a/doc/distributions/mapairy.qbk +++ b/doc/distributions/mapairy.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - mapairy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED mapairy_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; It is special case of a [@http://en.wikipedia.org/wiki/Stable_distribution stable distribution] @@ -50,7 +50,7 @@ the distribution: [h4 Member Functions] - mapairy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED mapairy_distribution(RealType location = 0, RealType scale = 1); Constructs a mapairy distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -59,11 +59,11 @@ then the result is a Standard map-airy Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -71,6 +71,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the map-airy distribution does not have a skewness, kurtosis, etc. See __math_undefined diff --git a/doc/distributions/saspoint5.qbk b/doc/distributions/saspoint5.qbk index 1421b5bac..06efbd32e 100644 --- a/doc/distributions/saspoint5.qbk +++ b/doc/distributions/saspoint5.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - saspoint5_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED saspoint5_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; It is special case of a [@http://en.wikipedia.org/wiki/Stable_distribution stable distribution] @@ -49,7 +49,7 @@ the distribution: [h4 Member Functions] - saspoint5_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED saspoint5_distribution(RealType location = 0, RealType scale = 1); Constructs a S[alpha]S Point5 distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -58,11 +58,11 @@ then the result is a Standard S[alpha]S Point5 Distribution. Requires scale > 0, otherwise calls __domain_error. 
- RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -70,6 +70,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the S[alpha]S Point5 distribution does not have a mean, standard deviation, etc. See __math_undefined diff --git a/doc/distributions/weibull.qbk b/doc/distributions/weibull.qbk index 95c9e461e..5d7c11b5f 100644 --- a/doc/distributions/weibull.qbk +++ b/doc/distributions/weibull.qbk @@ -17,10 +17,10 @@ typedef RealType value_type; typedef Policy policy_type; // Construct: - weibull_distribution(RealType shape, RealType scale = 1) + BOOST_MATH_GPU_ENABLED weibull_distribution(RealType shape, RealType scale = 1) // Accessors: - RealType shape()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; }} // namespaces @@ -65,7 +65,7 @@ Samuel Kotz & Saralees Nadarajah]. [h4 Member Functions] - weibull_distribution(RealType shape, RealType scale = 1); + BOOST_MATH_GPU_ENABLED weibull_distribution(RealType shape, RealType scale = 1); Constructs a [@http://en.wikipedia.org/wiki/Weibull_distribution Weibull distribution] with shape /shape/ and scale /scale/. @@ -73,11 +73,11 @@ Weibull distribution] with shape /shape/ and scale /scale/. Requires that the /shape/ and /scale/ parameters are both greater than zero, otherwise calls __domain_error. - RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; Returns the /shape/ parameter of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the /scale/ parameter of this distribution. @@ -85,6 +85,8 @@ Returns the /scale/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0, [infin]\]. 
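+For example, the PDF can be evaluated on device (a minimal sketch; the kernel name and the shape value are illustrative):
+
+    __global__ void weibull_pdf(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            // in[i] is expected to be >= 0
+            out[i] = pdf(boost::math::weibull_distribution<double>(1.5), in[i]);
+        }
+    }
+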
From 539c81b15ac4f604b36eac06502716bf9dadbb03 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 16:08:41 -0400
Subject: [PATCH 04/31] Add markers to special functions

---
 doc/overview/gpu.qbk         |  5 +++
 doc/sf/bessel_ik.qbk         |  8 ++---
 doc/sf/bessel_jy.qbk         |  8 ++---
 doc/sf/bessel_spherical.qbk  |  8 ++---
 doc/sf/beta.qbk              |  4 +--
 doc/sf/beta_derivative.qbk   |  4 +--
 doc/sf/digamma.qbk           |  4 +--
 doc/sf/erf.qbk               | 16 ++++-----
 doc/sf/erf_inv.qbk           | 16 ++++-----
 doc/sf/gamma_derivatives.qbk |  4 +--
 doc/sf/gamma_ratios.qbk      | 16 ++++-----
 doc/sf/ibeta.qbk             | 32 +++++++++---------
 doc/sf/ibeta_inv.qbk         | 64 ++++++++++++++++++------------------
 doc/sf/igamma.qbk            | 32 +++++++++---------
 doc/sf/igamma_inv.qbk        | 32 +++++++++---------
 doc/sf/lgamma.qbk            |  8 ++---
 doc/sf/pow.qbk               |  4 +--
 doc/sf/sinc.qbk              | 12 +++----
 doc/sf/tgamma.qbk            | 16 ++++-----
 doc/sf/trigamma.qbk          |  4 +--
 20 files changed, 151 insertions(+), 146 deletions(-)

diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
index 70f0164e0..18ebaba2a 100644
--- a/doc/overview/gpu.qbk
+++ b/doc/overview/gpu.qbk
@@ -6,6 +6,11 @@ Selected functions, distributions, tools, etc. support running on both host and
 These functions will have the annotation `BOOST_MATH_GPU_ENABLED` next to their individual documentation.
 We test using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
 
+[h4 Policies]
+
+The default error-handling policy on all devices is `ignore_error`, because exceptions cannot be thrown on device.
+A user can specify their own policy as usual, but when the code is run on device it will be ignored.
+
 [h4 How to build with device support]
diff --git a/doc/sf/bessel_ik.qbk b/doc/sf/bessel_ik.qbk
index d044ac7b8..9fa4e63a7 100644
--- a/doc/sf/bessel_ik.qbk
+++ b/doc/sf/bessel_ik.qbk
@@ -5,16 +5,16 @@
 `#include <boost/math/special_functions/bessel.hpp>`
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_bessel_i(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_i(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_bessel_i(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_i(T1 v, T2 x, const ``__Policy``&);
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_bessel_k(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_k(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_bessel_k(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_k(T1 v, T2 x, const ``__Policy``&);
 
 [h4 Description]
diff --git a/doc/sf/bessel_jy.qbk b/doc/sf/bessel_jy.qbk
index 1f43bc758..faf878850 100644
--- a/doc/sf/bessel_jy.qbk
+++ b/doc/sf/bessel_jy.qbk
@@ -5,16 +5,16 @@
 `#include <boost/math/special_functions/bessel.hpp>`
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_bessel_j(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_j(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_bessel_j(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_j(T1 v, T2 x, const ``__Policy``&);
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_neumann(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_neumann(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_neumann(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_neumann(T1 v, T2 x, const ``__Policy``&);
 
 [h4 Description]
diff --git a/doc/sf/bessel_spherical.qbk b/doc/sf/bessel_spherical.qbk
index e9cda89c7..eb1fa6915 100644
--- a/doc/sf/bessel_spherical.qbk
+++ b/doc/sf/bessel_spherical.qbk
@@ -5,16 +5,16 @@
 `#include <boost/math/special_functions/bessel.hpp>`
 
    template <class T2>
-   ``__sf_result`` sph_bessel(unsigned v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result``
sph_bessel(unsigned v, T2 x); template - ``__sf_result`` sph_bessel(unsigned v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sph_bessel(unsigned v, T2 x, const ``__Policy``&); template - ``__sf_result`` sph_neumann(unsigned v, T2 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sph_neumann(unsigned v, T2 x); template - ``__sf_result`` sph_neumann(unsigned v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sph_neumann(unsigned v, T2 x, const ``__Policy``&); [h4 Description] diff --git a/doc/sf/beta.qbk b/doc/sf/beta.qbk index e332fa503..7e1904c25 100644 --- a/doc/sf/beta.qbk +++ b/doc/sf/beta.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` beta(T1 a, T2 b); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b); template - ``__sf_result`` beta(T1 a, T2 b, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/beta_derivative.qbk b/doc/sf/beta_derivative.qbk index 8606d6f2b..5d3b9a13e 100644 --- a/doc/sf/beta_derivative.qbk +++ b/doc/sf/beta_derivative.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x); template - ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/digamma.qbk b/doc/sf/digamma.qbk index c88c5fe7b..78b68403d 100644 --- a/doc/sf/digamma.qbk +++ b/doc/sf/digamma.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` digamma(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` digamma(T z); template - ``__sf_result`` digamma(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` digamma(T z, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/erf.qbk b/doc/sf/erf.qbk index 3207b66c0..5f6bdf9fa 100644 --- a/doc/sf/erf.qbk +++ b/doc/sf/erf.qbk @@ -9,16 +9,16 @@ namespace boost{ namespace math{ template - ``__sf_result`` erf(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z); template - ``__sf_result`` erf(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z, const ``__Policy``&); template - ``__sf_result`` erfc(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z); template - ``__sf_result`` erfc(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z, const ``__Policy``&); }} // namespaces @@ -30,10 +30,10 @@ the return type is `double` if T is an integer type, and T otherwise. 
[h4 Description] template - ``__sf_result`` erf(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z); template - ``__sf_result`` erf(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z, const ``__Policy``&); Returns the [@http://en.wikipedia.org/wiki/Error_function error function] [@http://functions.wolfram.com/GammaBetaErf/Erf/ erf] of z: @@ -43,10 +43,10 @@ Returns the [@http://en.wikipedia.org/wiki/Error_function error function] [graph erf] template - ``__sf_result`` erfc(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z); template - ``__sf_result`` erfc(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z, const ``__Policy``&); Returns the complement of the [@http://functions.wolfram.com/GammaBetaErf/Erfc/ error function] of z: diff --git a/doc/sf/erf_inv.qbk b/doc/sf/erf_inv.qbk index 729ec22d2..e8f7464e0 100644 --- a/doc/sf/erf_inv.qbk +++ b/doc/sf/erf_inv.qbk @@ -9,16 +9,16 @@ namespace boost{ namespace math{ template - ``__sf_result`` erf_inv(T p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T p); template - ``__sf_result`` erf_inv(T p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T p, const ``__Policy``&); template - ``__sf_result`` erfc_inv(T p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T p); template - ``__sf_result`` erfc_inv(T p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T p, const ``__Policy``&); }} // namespaces @@ -30,10 +30,10 @@ the return type is `double` if T is an integer type, and T otherwise. [h4 Description] template - ``__sf_result`` erf_inv(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T z); template - ``__sf_result`` erf_inv(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T z, const ``__Policy``&); Returns the [@http://functions.wolfram.com/GammaBetaErf/InverseErf/ inverse error function] of z, that is a value x such that: @@ -43,10 +43,10 @@ of z, that is a value x such that: [graph erf_inv] template - ``__sf_result`` erfc_inv(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T z); template - ``__sf_result`` erfc_inv(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T z, const ``__Policy``&); Returns the inverse of the complement of the error function of z, that is a value x such that: diff --git a/doc/sf/gamma_derivatives.qbk b/doc/sf/gamma_derivatives.qbk index c7dd24879..1b578d8d9 100644 --- a/doc/sf/gamma_derivatives.qbk +++ b/doc/sf/gamma_derivatives.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` gamma_p_derivative(T1 a, T2 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_derivative(T1 a, T2 x); template - ``__sf_result`` gamma_p_derivative(T1 a, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_derivative(T1 a, T2 x, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/gamma_ratios.qbk b/doc/sf/gamma_ratios.qbk index a3fcf864c..0d076890d 100644 --- a/doc/sf/gamma_ratios.qbk +++ b/doc/sf/gamma_ratios.qbk @@ -7,26 +7,26 @@ namespace boost{ namespace math{ template - ``__sf_result`` tgamma_ratio(T1 a, T2 b); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b); template - ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); template - ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); template - 
``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); }} // namespaces [h4 Description] template - ``__sf_result`` tgamma_ratio(T1 a, T2 b); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b); template - ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); Returns the ratio of gamma functions: @@ -37,10 +37,10 @@ Returns the ratio of gamma functions: Internally this just calls `tgamma_delta_ratio(a, b-a)`. template - ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); template - ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); Returns the ratio of gamma functions: diff --git a/doc/sf/ibeta.qbk b/doc/sf/ibeta.qbk index b4a20f928..5227b2d34 100644 --- a/doc/sf/ibeta.qbk +++ b/doc/sf/ibeta.qbk @@ -9,28 +9,28 @@ namespace boost{ namespace math{ template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x); template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x); template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); template - ``__sf_result`` beta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x); template - ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); template - ``__sf_result`` betac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x); template - ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); }} // namespaces @@ -57,10 +57,10 @@ when T1, T2 and T3 are different types. 
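+For example, the normalised incomplete beta can be evaluated on device (a minimal sketch; the kernel name and shape parameters are illustrative):
+
+    __global__ void ibeta_kernel(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            // in[i] is expected to be in [0, 1]
+            out[i] = boost::math::ibeta(2.0, 3.0, in[i]);
+        }
+    }
+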
[optional_policy] template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x); template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the normalised incomplete beta function of a, b and x: @@ -69,30 +69,30 @@ Returns the normalised incomplete beta function of a, b and x: [graph ibeta] template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x); template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the normalised complement of the incomplete beta function of a, b and x: [equation ibeta4] template - ``__sf_result`` beta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x); template - ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the full (non-normalised) incomplete beta function of a, b and x: [equation ibeta1] template - ``__sf_result`` betac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x); template - ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the full (non-normalised) complement of the incomplete beta function of a, b and x: diff --git a/doc/sf/ibeta_inv.qbk b/doc/sf/ibeta_inv.qbk index 83c2b0008..60049db46 100644 --- a/doc/sf/ibeta_inv.qbk +++ b/doc/sf/ibeta_inv.qbk @@ -7,52 +7,52 @@ namespace boost{ namespace math{ template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q); template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED 
``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q, const ``__Policy``&); template - ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p); template - ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p, const ``__Policy``&); template - ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q); template - ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q, const ``__Policy``&); }} // namespaces @@ -81,16 +81,16 @@ The return type of these functions is computed using the __arg_promotion_rules when called with arguments T1...TN of different types. template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); Returns a value /x/ such that: `p = ibeta(a, b, x);` and sets `*py = 1 - x` when the `py` parameter is provided and is non-null. @@ -104,16 +104,16 @@ Requires: /a,b > 0/ and /0 <= p <= 1/. [optional_policy] template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); Returns a value /x/ such that: `q = ibetac(a, b, x);` and sets `*py = 1 - x` when the `py` parameter is provided and is non-null. @@ -127,10 +127,10 @@ Requires: /a,b > 0/ and /0 <= q <= 1/. [optional_policy] template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /a/ such that: `p = ibeta(a, b, x);` @@ -139,10 +139,10 @@ Requires: /b > 0/, /0 < x < 1/ and /0 <= p <= 1/. [optional_policy] template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p); template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /a/ such that: `q = ibetac(a, b, x);` @@ -151,10 +151,10 @@ Requires: /b > 0/, /0 < x < 1/ and /0 <= q <= 1/. 
[optional_policy] template - ``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p); template - ``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /b/ such that: `p = ibeta(a, b, x);` @@ -163,10 +163,10 @@ Requires: /a > 0/, /0 < x < 1/ and /0 <= p <= 1/. [optional_policy] template - ``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p); template - ``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /b/ such that: `q = ibetac(a, b, x);` diff --git a/doc/sf/igamma.qbk b/doc/sf/igamma.qbk index ca354ad10..4675928e6 100644 --- a/doc/sf/igamma.qbk +++ b/doc/sf/igamma.qbk @@ -9,28 +9,28 @@ namespace boost{ namespace math{ template - ``__sf_result`` gamma_p(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z); template - ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); template - ``__sf_result`` gamma_q(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z); template - ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); template - ``__sf_result`` tgamma_lower(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z); template - ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); template - ``__sf_result`` tgamma(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z); template - ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); }} // namespaces @@ -53,10 +53,10 @@ The return type of these functions is computed using the __arg_promotion_rules when T1 and T2 are different types, otherwise the return type is simply T1. 
template - ``__sf_result`` gamma_p(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z); template - ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); Returns the normalised lower incomplete gamma function of a and z: @@ -67,10 +67,10 @@ This function changes rapidly from 0 to 1 around the point z == a: [graph gamma_p] template - ``__sf_result`` gamma_q(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z); template - ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); Returns the normalised upper incomplete gamma function of a and z: @@ -81,20 +81,20 @@ This function changes rapidly from 1 to 0 around the point z == a: [graph gamma_q] template - ``__sf_result`` tgamma_lower(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z); template - ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); Returns the full (non-normalised) lower incomplete gamma function of a and z: [equation igamma2] template - ``__sf_result`` tgamma(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z); template - ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); Returns the full (non-normalised) upper incomplete gamma function of a and z: diff --git a/doc/sf/igamma_inv.qbk b/doc/sf/igamma_inv.qbk index 593c92141..55fe76e6e 100644 --- a/doc/sf/igamma_inv.qbk +++ b/doc/sf/igamma_inv.qbk @@ -9,28 +9,28 @@ namespace boost{ namespace math{ template - ``__sf_result`` gamma_q_inv(T1 a, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q); template - ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); template - ``__sf_result`` gamma_p_inv(T1 a, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p); template - ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); template - ``__sf_result`` gamma_q_inva(T1 x, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q); template - ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); template - ``__sf_result`` gamma_p_inva(T1 x, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p); template - ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); }} // namespaces @@ -58,40 +58,40 @@ These are implemented here as `gamma_p_inva` and `gamma_q_inva`.] template - ``__sf_result`` gamma_q_inv(T1 a, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q); template - ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); Returns a value x such that: `q = gamma_q(a, x);` Requires: /a > 0/ and /1 >= p,q >= 0/. 
template - ``__sf_result`` gamma_p_inv(T1 a, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p); template - ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); Returns a value x such that: `p = gamma_p(a, x);` Requires: /a > 0/ and /1 >= p,q >= 0/. template - ``__sf_result`` gamma_q_inva(T1 x, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q); template - ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); Returns a value a such that: `q = gamma_q(a, x);` Requires: /x > 0/ and /1 >= p,q >= 0/. template - ``__sf_result`` gamma_p_inva(T1 x, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p); template - ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); Returns a value a such that: `p = gamma_p(a, x);` diff --git a/doc/sf/lgamma.qbk b/doc/sf/lgamma.qbk index 5ea1a4e09..544485c7c 100644 --- a/doc/sf/lgamma.qbk +++ b/doc/sf/lgamma.qbk @@ -9,16 +9,16 @@ namespace boost{ namespace math{ template - ``__sf_result`` lgamma(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z); template - ``__sf_result`` lgamma(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z, const ``__Policy``&); template - ``__sf_result`` lgamma(T z, int* sign); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z, int* sign); template - ``__sf_result`` lgamma(T z, int* sign, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z, int* sign, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/pow.qbk b/doc/sf/pow.qbk index db021978e..ecb762d71 100644 --- a/doc/sf/pow.qbk +++ b/doc/sf/pow.qbk @@ -10,10 +10,10 @@ power of a run-time base. namespace boost { namespace math { template - constexpr ``__sf_result`` pow(T base); + BOOST_MATH_GPU_ENABLED constexpr ``__sf_result`` pow(T base); template - constexpr ``__sf_result`` pow(T base, const Policy& policy); + BOOST_MATH_GPU_ENABLED constexpr ``__sf_result`` pow(T base, const Policy& policy); }} diff --git a/doc/sf/sinc.qbk b/doc/sf/sinc.qbk index b345c08cd..a6042a717 100644 --- a/doc/sf/sinc.qbk +++ b/doc/sf/sinc.qbk @@ -43,16 +43,16 @@ and [@http://mathworld.wolfram.com/Octonion.html octonions]. `` template - ``__sf_result`` sinc_pi(const T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x); template - ``__sf_result`` sinc_pi(const T x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x, const ``__Policy``&); template class U> - U sinc_pi(const U x); + BOOST_MATH_GPU_ENABLED U sinc_pi(const U x); template class U, class ``__Policy``> - U sinc_pi(const U x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED U sinc_pi(const U x, const ``__Policy``&); Computes [link math_toolkit.sinc.sinc_overview @@ -78,10 +78,10 @@ to ensure accuracy. 
diff --git a/doc/sf/sinc.qbk b/doc/sf/sinc.qbk
index b345c08cd..a6042a717 100644
--- a/doc/sf/sinc.qbk
+++ b/doc/sf/sinc.qbk
@@ -43,16 +43,16 @@ and [@http://mathworld.wolfram.com/Octonion.html octonions].
 ``
   template <class T>
-  ``__sf_result`` sinc_pi(const T x);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x);

   template <class T, class ``__Policy``>
-  ``__sf_result`` sinc_pi(const T x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x, const ``__Policy``&);

   template<typename T, template<typename> class U>
-  U<T> sinc_pi(const U<T> x);
+  BOOST_MATH_GPU_ENABLED U<T> sinc_pi(const U<T> x);

   template<typename T, template<typename> class U, class ``__Policy``>
-  U<T> sinc_pi(const U<T> x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED U<T> sinc_pi(const U<T> x, const ``__Policy``&);

 Computes [link math_toolkit.sinc.sinc_overview
@@ -78,10 +78,10 @@ to ensure accuracy.
 ``
   template <class T>
-  ``__sf_result`` sinhc_pi(const T x);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinhc_pi(const T x);

   template <class T, class ``__Policy``>
-  ``__sf_result`` sinhc_pi(const T x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinhc_pi(const T x, const ``__Policy``&);

   template<typename T, template<typename> class U>
   U<T> sinhc_pi(const U<T> x);

diff --git a/doc/sf/tgamma.qbk b/doc/sf/tgamma.qbk
index 7eb535ec3..23baad2cb 100644
--- a/doc/sf/tgamma.qbk
+++ b/doc/sf/tgamma.qbk
@@ -9,26 +9,26 @@
 namespace boost{ namespace math{

   template <class T>
-  ``__sf_result`` tgamma(T z);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma(T z, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z, const ``__Policy``&);

   template <class T>
-  ``__sf_result`` tgamma1pm1(T dz);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);

   }} // namespaces

 [h4 Description]

   template <class T>
-  ``__sf_result`` tgamma(T z);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma(T z, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z, const ``__Policy``&);

 Returns the "true gamma" (hence the name tgamma) of value z:

@@ -42,10 +42,10 @@ The return type of this function is computed using the __arg_promotion_rules:
 the result is `double` when T is an integer type, and T otherwise.

   template <class T>
-  ``__sf_result`` tgamma1pm1(T dz);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);

 Returns `tgamma(dz + 1) - 1`. Internally the implementation does not make use
 of the addition and subtraction implied by the definition, leading to

diff --git a/doc/sf/trigamma.qbk b/doc/sf/trigamma.qbk
index 137a148d8..a358c8571 100644
--- a/doc/sf/trigamma.qbk
+++ b/doc/sf/trigamma.qbk
@@ -9,10 +9,10 @@
 namespace boost{ namespace math{

   template <class T>
-  ``__sf_result`` trigamma(T x);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` trigamma(T x);

   template <class T, class ``__Policy``>
-  ``__sf_result`` trigamma(T x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` trigamma(T x, const ``__Policy``&);

   }} // namespaces

From 2729683b09b7243f33f21ede21f10bde5d4fd811 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 16:11:45 -0400
Subject: [PATCH 05/31] Add markers to Newton-Raphson

---
 doc/roots/roots.qbk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/roots/roots.qbk b/doc/roots/roots.qbk
index a22930069..ee3619f4f 100644
--- a/doc/roots/roots.qbk
+++ b/doc/roots/roots.qbk
@@ -10,10 +10,10 @@
 namespace tools { // Note namespace boost::math::tools.
   // Newton-Raphson
   template <class F, class T>
-  T newton_raphson_iterate(F f, T guess, T min, T max, int digits);
+  BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits);

   template <class F, class T>
-  T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter);
+  BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter);

   // Halley
   template <class F, class T>

From b13fcb07c13a62dc2461563808866c452a727c56 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 16:14:06 -0400
Subject: [PATCH 06/31] Replace broken umlaut-o with oe (Ersatzschreibung)

---
 doc/roots/roots.qbk | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/roots/roots.qbk b/doc/roots/roots.qbk
index ee3619f4f..ea347639b 100644
--- a/doc/roots/roots.qbk
+++ b/doc/roots/roots.qbk
@@ -1,4 +1,4 @@
-[section:roots_deriv Root Finding With Derivatives: Newton-Raphson, Halley & Schr'''ö'''der]
+[section:roots_deriv Root Finding With Derivatives: Newton-Raphson, Halley & Schroeder]

 [h4 Synopsis]

@@ -22,7 +22,7 @@
   template <class F, class T>
   T halley_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter);

-  // Schr'''ö'''der
+  // Schroeder
   template <class F, class T>
   T schroder_iterate(F f, T guess, T min, T max, int digits);

@@ -61,7 +61,7 @@ For second-order iterative method ([@http://en.wikipedia.org/wiki/Newton_Raphson
 For the third-order methods ([@http://en.wikipedia.org/wiki/Halley%27s_method Halley] and
-Schr'''ö'''der)
+Schroeder)
 the `tuple` should have [*three] elements containing the evaluation of the
 function and its first and second derivatives.]]
 [[T guess] [The initial starting value. A good guess is crucial to quick convergence!]]

@@ -147,7 +147,7 @@
 Out of bounds steps revert to bisection of the current bounds.

 Under ideal conditions, the number of correct digits trebles with each iteration.

-[h4:schroder Schr'''ö'''der's Method]
+[h4:schroder Schroeder's Method]

 Given an initial guess x0 the subsequent values are computed using:

@@ -162,8 +162,8 @@
 Out of bounds steps revert to __bisection_wikipedia of the current bounds.

 Under ideal conditions, the number of correct digits trebles with each iteration.

-This is Schr'''ö'''der's general result (equation 18 from [@http://drum.lib.umd.edu/handle/1903/577 Stewart, G. W.
-"On Infinitely Many Algorithms for Solving Equations." English translation of Schr'''ö'''der's original paper.
+This is Schroeder's general result (equation 18 from [@http://drum.lib.umd.edu/handle/1903/577 Stewart, G. W.
+"On Infinitely Many Algorithms for Solving Equations." English translation of Schroeder's original paper.
 College Park, MD: University of Maryland, Institute for Advanced Computer Studies, Department of Computer Science, 1993].)
 This method guarantees at least quadratic convergence (the same as Newton's method), and is known
 to work well in the presence of multiple roots:

From db0dfc7531e959f0628cb84a2bf4fee45c6f1acc Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 10:33:56 -0400
Subject: [PATCH 07/31] Fix missing end section

---
 doc/overview/gpu.qbk | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
index 18ebaba2a..b97b059a6 100644
--- a/doc/overview/gpu.qbk
+++ b/doc/overview/gpu.qbk
@@ -55,6 +55,8 @@ And lastly on SYCL:

 Once your kernel function has been written, use the framework's mechanism for launching the kernel.

+[endsect] [/section:gpu Support for GPU programming in Boost.Math]
+
 [/
   Copyright 2024.
Matt Borland Distributed under the Boost Software License, Version 1.0. From 81cf65ccd6ebb4c0c4303a8b5efaa9671cb64808 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 10:48:07 -0400 Subject: [PATCH 08/31] Add GPU markers to fisher f dist --- include/boost/math/distributions/fisher_f.hpp | 69 ++++++++++--------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/include/boost/math/distributions/fisher_f.hpp b/include/boost/math/distributions/fisher_f.hpp index e22cdf50a..56b288d88 100644 --- a/include/boost/math/distributions/fisher_f.hpp +++ b/include/boost/math/distributions/fisher_f.hpp @@ -1,5 +1,5 @@ // Copyright John Maddock 2006. - +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -8,14 +8,15 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP #define BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP +#include +#include +#include #include #include // for incomplete beta. #include // complements #include // error checks #include -#include - namespace boost{ namespace math{ template > @@ -25,9 +26,9 @@ class fisher_f_distribution typedef RealType value_type; typedef Policy policy_type; - fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j) + BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j) { - static const char* function = "fisher_f_distribution<%1%>::fisher_f_distribution"; + constexpr auto function = "fisher_f_distribution<%1%>::fisher_f_distribution"; RealType result; detail::check_df( function, m_df1, &result, Policy()); @@ -35,11 +36,11 @@ class fisher_f_distribution function, m_df2, &result, Policy()); } // fisher_f_distribution - RealType degrees_of_freedom1()const + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const { return m_df1; } - RealType degrees_of_freedom2()const + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const { return m_df2; } @@ -60,29 +61,29 @@ fisher_f_distribution(RealType,RealType)->fisher_f_distribution -inline const std::pair range(const fisher_f_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair range(const fisher_f_distribution& /*dist*/) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline const std::pair support(const fisher_f_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair support(const fisher_f_distribution& /*dist*/) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. 
using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -RealType pdf(const fisher_f_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED RealType pdf(const fisher_f_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: RealType error_result = 0; - static const char* function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)"; if(false == (detail::check_df( function, df1, &error_result, Policy()) && detail::check_df( @@ -132,9 +133,9 @@ RealType pdf(const fisher_f_distribution& dist, const RealType } // pdf template -inline RealType cdf(const fisher_f_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const fisher_f_distribution& dist, const RealType& x) { - static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -167,9 +168,9 @@ inline RealType cdf(const fisher_f_distribution& dist, const R } // cdf template -inline RealType quantile(const fisher_f_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const fisher_f_distribution& dist, const RealType& p) { - static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -192,9 +193,9 @@ inline RealType quantile(const fisher_f_distribution& dist, co } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { - static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = c.dist.degrees_of_freedom1(); RealType df2 = c.dist.degrees_of_freedom2(); RealType x = c.param; @@ -228,9 +229,9 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { - static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = c.dist.degrees_of_freedom1(); RealType df2 = c.dist.degrees_of_freedom2(); RealType p = c.param; @@ -252,9 +253,9 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const fisher_f_distribution& dist) { // Mean of F distribution = v. 
- static const char* function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -273,9 +274,9 @@ inline RealType mean(const fisher_f_distribution& dist) } // mean template -inline RealType variance(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const fisher_f_distribution& dist) { // Variance of F distribution. - static const char* function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -294,9 +295,9 @@ inline RealType variance(const fisher_f_distribution& dist) } // variance template -inline RealType mode(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const fisher_f_distribution& dist) { - static const char* function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -317,15 +318,15 @@ inline RealType mode(const fisher_f_distribution& dist) //template //inline RealType median(const fisher_f_distribution& dist) //{ // Median of Fisher F distribution is not defined. -// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", std::numeric_limits::quiet_NaN()); +// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", boost::math::numeric_limits::quiet_NaN()); // } // median // Now implemented via quantile(half) in derived accessors. 
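// A minimal usage sketch (an illustration only, assuming a CUDA translation
// unit; the kernel name and launch setup below are assumptions, not part of
// this header): with the BOOST_MATH_GPU_ENABLED annotations above, the
// non-member accessors can be evaluated from device code just as on the host:
//
//   __global__ void fisher_f_variance(double* out, int n)
//   {
//       const int i = blockDim.x * blockIdx.x + threadIdx.x;
//       if (i < n)
//       {
//           const boost::math::fisher_f_distribution<double> dist(5, 10);
//           out[i] = variance(dist); // found via ADL, exactly as in host code
//       }
//   }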
template -inline RealType skewness(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const fisher_f_distribution& dist) { - static const char* function = "boost::math::skewness(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::skewness(fisher_f_distribution<%1%> const&)"; BOOST_MATH_STD_USING // ADL of std names // See http://mathworld.wolfram.com/F-Distribution.html RealType df1 = dist.degrees_of_freedom1(); @@ -346,18 +347,18 @@ inline RealType skewness(const fisher_f_distribution& dist) } template -RealType kurtosis_excess(const fisher_f_distribution& dist); +BOOST_MATH_GPU_ENABLED RealType kurtosis_excess(const fisher_f_distribution& dist); template -inline RealType kurtosis(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const fisher_f_distribution& dist) { return 3 + kurtosis_excess(dist); } template -inline RealType kurtosis_excess(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const fisher_f_distribution& dist) { - static const char* function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)"; // See http://mathworld.wolfram.com/F-Distribution.html RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); From 0882eccc76e3932e83c16516dbcdad39e7879137 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 10:48:17 -0400 Subject: [PATCH 09/31] Add SYCL testing of fisher f dist --- test/sycl_jamfile | 1 + test/test_fisher_f.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 5d3d85cd8..baf2f95d2 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -17,6 +17,7 @@ run test_cauchy.cpp ; run test_chi_squared.cpp ; run test_exponential_dist.cpp ; run test_extreme_value.cpp ; +run test_fisher_f.cpp ; run test_holtsmark.cpp ; run test_landau.cpp ; run test_laplace.cpp ; diff --git a/test/test_fisher_f.cpp b/test/test_fisher_f.cpp index c18ed8ff1..f142a3327 100644 --- a/test/test_fisher_f.cpp +++ b/test/test_fisher_f.cpp @@ -8,9 +8,13 @@ // (See accompanying file LICENSE_1_0.txt // or copy at http://www.boost.org/LICENSE_1_0.txt) -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; +#endif #include // for fisher_f_distribution using boost::math::fisher_f_distribution; From 438c2546eaee899a9239bff2ef57accf8a944ac1 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 11:05:57 -0400 Subject: [PATCH 10/31] Add CUDA fisher f dist testing --- test/cuda_jamfile | 7 ++ test/test_fisher_f_cdf_double.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_cdf_float.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_pdf_double.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_pdf_float.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_quan_double.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_quan_float.cu | 109 ++++++++++++++++++++++++++++++ 7 files changed, 661 insertions(+) create mode 100644 test/test_fisher_f_cdf_double.cu create mode 100644 test/test_fisher_f_cdf_float.cu create mode 100644 test/test_fisher_f_pdf_double.cu create mode 100644 test/test_fisher_f_pdf_float.cu create mode 100644 test/test_fisher_f_quan_double.cu create mode 100644 
test/test_fisher_f_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index c697da8e9..f517f4257 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -65,6 +65,13 @@ run test_extreme_value_pdf_float.cu ; run test_extreme_value_quan_double.cu ; run test_extreme_value_quan_float.cu ; +run test_fisher_f_cdf_double.cu ; +run test_fisher_f_cdf_float.cu ; +run test_fisher_f_pdf_double.cu ; +run test_fisher_f_pdf_float.cu ; +run test_fisher_f_quan_double.cu ; +run test_fisher_f_quan_float.cu ; + run test_holtsmark_cdf_double.cu ; run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; diff --git a/test/test_fisher_f_cdf_double.cu b/test/test_fisher_f_cdf_double.cu new file mode 100644 index 000000000..877961166 --- /dev/null +++ b/test/test_fisher_f_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_cdf_float.cu b/test/test_fisher_f_cdf_float.cu new file mode 100644 index 000000000..a6fcc9f98 --- /dev/null +++ b/test/test_fisher_f_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_pdf_double.cu b/test/test_fisher_f_pdf_double.cu new file mode 100644 index 000000000..e4ae50791 --- /dev/null +++ b/test/test_fisher_f_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_pdf_float.cu b/test/test_fisher_f_pdf_float.cu new file mode 100644 index 000000000..7b7583736 --- /dev/null +++ b/test/test_fisher_f_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_double.cu b/test/test_fisher_f_quan_double.cu new file mode 100644 index 000000000..42bcb0dac --- /dev/null +++ b/test/test_fisher_f_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_float.cu b/test/test_fisher_f_quan_float.cu new file mode 100644 index 000000000..3a0bc688b --- /dev/null +++ b/test/test_fisher_f_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file From ee54ae0a93eeae2e3a0e8ab685eee73cc997986f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 11:25:00 -0400 Subject: [PATCH 11/31] Add NVRTC fisher f dist testing --- test/nvrtc_jamfile | 7 + test/test_fisher_f_cdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_cdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_pdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_pdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_quan_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_quan_nvrtc_float.cpp | 191 +++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_fisher_f_cdf_nvrtc_double.cpp create mode 100644 test/test_fisher_f_cdf_nvrtc_float.cpp create mode 100644 test/test_fisher_f_pdf_nvrtc_double.cpp create mode 100644 test/test_fisher_f_pdf_nvrtc_float.cpp create mode 100644 test/test_fisher_f_quan_nvrtc_double.cpp create mode 100644 test/test_fisher_f_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 1fc2746a1..438e41c88 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -59,6 +59,13 @@ run test_extreme_value_pdf_nvrtc_float.cpp ; run test_extreme_value_quan_nvrtc_double.cpp ; run test_extreme_value_quan_nvrtc_float.cpp ; +run test_fisher_f_cdf_nvrtc_double.cpp ; +run test_fisher_f_cdf_nvrtc_float.cpp ; +run test_fisher_f_pdf_nvrtc_double.cpp ; +run test_fisher_f_pdf_nvrtc_float.cpp ; +run test_fisher_f_quan_nvrtc_double.cpp ; +run test_fisher_f_quan_nvrtc_float.cpp ; + run test_holtsmark_cdf_nvrtc_double.cpp ; run test_holtsmark_cdf_nvrtc_float.cpp ; run test_holtsmark_pdf_nvrtc_double.cpp ; diff --git a/test/test_fisher_f_cdf_nvrtc_double.cpp b/test/test_fisher_f_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..1eb9cb00f --- /dev/null +++ b/test/test_fisher_f_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_cdf_nvrtc_float.cpp b/test/test_fisher_f_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..244190cf1 --- /dev/null +++ b/test/test_fisher_f_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_pdf_nvrtc_double.cpp b/test/test_fisher_f_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..8aa1482aa --- /dev/null +++ b/test/test_fisher_f_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_pdf_nvrtc_float.cpp b/test/test_fisher_f_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..e461dea9a --- /dev/null +++ b/test/test_fisher_f_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_nvrtc_double.cpp b/test/test_fisher_f_quan_nvrtc_double.cpp new file mode 100644 index 000000000..16ad0cbc0 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_nvrtc_float.cpp b/test/test_fisher_f_quan_nvrtc_float.cpp new file mode 100644 index 000000000..377048e52 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From d573ab35cf2230f0b80473b6e1a10fbe9b004ec7 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 11:33:56 -0400 Subject: [PATCH 12/31] Add GPU support to gamma dist --- include/boost/math/distributions/gamma.hpp | 84 +++++++++++----------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/include/boost/math/distributions/gamma.hpp b/include/boost/math/distributions/gamma.hpp index 28b7c55b0..5176f906d 100644 --- a/include/boost/math/distributions/gamma.hpp +++ b/include/boost/math/distributions/gamma.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2006. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,22 +11,22 @@
 // http://mathworld.wolfram.com/GammaDistribution.html
 // http://en.wikipedia.org/wiki/Gamma_distribution
 
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/tools/tuple.hpp>
 #include <boost/math/distributions/fwd.hpp>
 #include <boost/math/tools/roots.hpp>
 #include <boost/math/special_functions/gamma.hpp>
 #include <boost/math/distributions/detail/common_error_handling.hpp>
 #include <boost/math/distributions/complement.hpp>
-#include <cmath>
-#include <utility>
-
 namespace boost{
 namespace math
 {
 namespace detail
 {
 
 template <class RealType, class Policy>
-inline bool check_gamma_shape(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_shape(
       const char* function,
       RealType shape,
       RealType* result, const Policy& pol)
@@ -41,7 +42,7 @@ inline bool check_gamma_shape(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma_x(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_x(
       const char* function,
       RealType const& x,
       RealType* result, const Policy& pol)
@@ -57,7 +58,7 @@ inline bool check_gamma_x(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma(
       const char* function,
       RealType scale,
       RealType shape,
@@ -75,19 +76,19 @@
    using value_type = RealType;
    using policy_type = Policy;
 
-   explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
+   BOOST_MATH_GPU_ENABLED explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
       : m_shape(l_shape), m_scale(l_scale)
    {
       RealType result;
      detail::check_gamma("boost::math::gamma_distribution<%1%>::gamma_distribution", l_scale, l_shape, &result, Policy());
    }
 
-   RealType shape()const
+   BOOST_MATH_GPU_ENABLED RealType shape()const
    {
      return m_shape;
    }
 
-   RealType scale()const
+   BOOST_MATH_GPU_ENABLED RealType scale()const
    {
      return m_scale;
    }
@@ -109,27 +110,27 @@ gamma_distribution(RealType,RealType)->gamma_distribution<typename boost::math::tools::promote_args<RealType>::type>;
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of permissible values for random variable x.
    using boost::math::tools::max_value;
-   return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+   return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of supported values for random variable x.
    // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
   using boost::math::tools::max_value;
    using boost::math::tools::min_value;
-   return std::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
+   return boost::math::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -149,17 +150,17 @@ inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const Real
 } // pdf
 
 template <class RealType, class Policy>
-inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
    using boost::math::lgamma;
 
-   static const char* function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType k = dist.shape();
    RealType theta = dist.scale();
 
-   RealType result = -std::numeric_limits<RealType>::infinity();
+   RealType result = -boost::math::numeric_limits<RealType>::infinity();
    if(false == detail::check_gamma(function, theta, k, &result, Policy()))
       return result;
    if(false == detail::check_gamma_x(function, x, &result, Policy()))
@@ -167,7 +168,7 @@ inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const R
 
    if(x == 0)
    {
-      return std::numeric_limits<RealType>::quiet_NaN();
+      return boost::math::numeric_limits<RealType>::quiet_NaN();
    }
 
    result = -k*log(theta) + (k-1)*log(x) - lgamma(k) - (x/theta);
@@ -176,11 +177,11 @@ inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const R
 } // logpdf
 
 template <class RealType, class Policy>
-inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -196,11 +197,11 @@ inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const Real
 } // cdf
 
 template <class RealType, class Policy>
-inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -220,11 +221,11 @@ inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const
 }
 
 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -241,11 +242,11 @@ inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy
 }
 
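+// Complement quantile: returns x such that 1 - cdf(dist, x) == q.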
 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -266,11 +267,11 @@ inline RealType quantile(const complemented2_type<gamma_distribution<RealType, P
 }
 
 template <class RealType, class Policy>
-inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mean(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mean(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -284,11 +285,11 @@ inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::variance(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::variance(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -302,11 +303,11 @@ inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mode(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mode(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -331,11 +332,11 @@ inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
 //}
 
 template <class RealType, class Policy>
-inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::skewness(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::skewness(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -349,11 +350,11 @@ inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -367,18 +368,19 @@ inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
 {
    return kurtosis_excess(dist) + 3;
 }
 
 template <class RealType, class Policy>
-inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
 {
+   BOOST_MATH_STD_USING
+
    RealType k = dist.shape();
    RealType theta = dist.scale();
-   using std::log;
-   return k + log(theta) + lgamma(k) + (1-k)*digamma(k);
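+   // differential entropy of Gamma(k, theta): k + log(theta) + lgamma(k) + (1 - k)*digamma(k)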
+   return k + log(theta) + boost::math::lgamma(k) + (1-k)*digamma(k);
 }
 
 } // namespace math

From 6f9c91e18e8bea043f9ef2ccce3b185b018e99bd Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:42:41 -0400
Subject: [PATCH 13/31] Add SYCL testing of gamma dist

---
 test/sycl_jamfile        | 1 +
 test/test_gamma_dist.cpp | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index baf2f95d2..03b130268 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -18,6 +18,7 @@ run test_chi_squared.cpp ;
 run test_exponential_dist.cpp ;
 run test_extreme_value.cpp ;
 run test_fisher_f.cpp ;
+run test_gamma_dist.cpp ;
 run test_holtsmark.cpp ;
 run test_landau.cpp ;
 run test_laplace.cpp ;
diff --git a/test/test_gamma_dist.cpp b/test/test_gamma_dist.cpp
index b7776c79c..2b1a181f3 100644
--- a/test/test_gamma_dist.cpp
+++ b/test/test_gamma_dist.cpp
@@ -15,16 +15,23 @@
 // From MathWorld--A Wolfram Web Resource.
 // http://mathworld.wolfram.com/GammaDistribution.html
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch.hpp> // include directory libs/math/src/tr1/ is needed.
+#endif
+
+#include <boost/math/tools/config.hpp>
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp> // for real_concept
+#endif
+
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp> // Boost.Test
 #include <boost/test/tools/floating_point_comparison.hpp>
 
 #include <boost/math/distributions/gamma.hpp>
     using boost::math::gamma_distribution;
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include "test_out_of_range.hpp"
 
 #include <iostream>

From 49c0190e6eb459e014426205b5a0715acd34b378 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:57:15 -0400
Subject: [PATCH 14/31] Add CUDA gamma dist testing

---
 test/cuda_jamfile                   |   7 ++
 test/test_gamma_dist_cdf_double.cu  | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_cdf_float.cu   | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_pdf_double.cu  | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_pdf_float.cu   | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_quan_double.cu | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_quan_float.cu  | 109 ++++++++++++++++++++++++++++
 7 files changed, 661 insertions(+)
 create mode 100644 test/test_gamma_dist_cdf_double.cu
 create mode 100644 test/test_gamma_dist_cdf_float.cu
 create mode 100644 test/test_gamma_dist_pdf_double.cu
 create mode 100644 test/test_gamma_dist_pdf_float.cu
 create mode 100644 test/test_gamma_dist_quan_double.cu
 create mode 100644 test/test_gamma_dist_quan_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index f517f4257..b01aa8bb1 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -72,6 +72,13 @@ run test_fisher_f_pdf_float.cu ;
 run test_fisher_f_quan_double.cu ;
 run test_fisher_f_quan_float.cu ;
 
+run test_gamma_dist_cdf_double.cu ;
+run test_gamma_dist_cdf_float.cu ;
+run test_gamma_dist_pdf_double.cu ;
+run test_gamma_dist_pdf_float.cu ;
+run test_gamma_dist_quan_double.cu ;
+run test_gamma_dist_quan_float.cu ;
+
 run test_holtsmark_cdf_double.cu ;
 run test_holtsmark_cdf_float.cu ;
 run test_holtsmark_pdf_double.cu ;
diff --git a/test/test_gamma_dist_cdf_double.cu b/test/test_gamma_dist_cdf_double.cu
new file mode 100644
index 000000000..6424850c3
--- /dev/null
+++ b/test/test_gamma_dist_cdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
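+// Uses the cuda_managed_ptr and stopwatch helpers that live alongside these tests.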
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA Kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::gamma_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_gamma_dist_cdf_float.cu b/test/test_gamma_dist_cdf_float.cu
new file mode 100644
index 000000000..4f2312ccc
--- /dev/null
+++ b/test/test_gamma_dist_cdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_pdf_double.cu b/test/test_gamma_dist_pdf_double.cu new file mode 100644 index 000000000..2f8bbc5f4 --- /dev/null +++ b/test/test_gamma_dist_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_pdf_float.cu b/test/test_gamma_dist_pdf_float.cu new file mode 100644 index 000000000..2080f5ccf --- /dev/null +++ b/test/test_gamma_dist_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_double.cu b/test/test_gamma_dist_quan_double.cu new file mode 100644 index 000000000..bde18fc36 --- /dev/null +++ b/test/test_gamma_dist_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_float.cu b/test/test_gamma_dist_quan_float.cu new file mode 100644 index 000000000..01ce85dfd --- /dev/null +++ b/test/test_gamma_dist_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file

From f8565d7b8b209c853f3dd059bf301caed4a1bb44 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:57:27 -0400
Subject: [PATCH 15/31] Add NVRTC gamma dist testing

---
 test/nvrtc_jamfile                         |   7 +
 test/test_gamma_dist_cdf_nvrtc_double.cpp  | 191 +++++++++++++++++++++
 test/test_gamma_dist_cdf_nvrtc_float.cpp   | 191 +++++++++++++++++++++
 test/test_gamma_dist_pdf_nvrtc_double.cpp  | 191 +++++++++++++++++++++
 test/test_gamma_dist_pdf_nvrtc_float.cpp   | 191 +++++++++++++++++++++
 test/test_gamma_dist_quan_nvrtc_double.cpp | 191 +++++++++++++++++++++
 test/test_gamma_dist_quan_nvrtc_float.cpp  | 191 +++++++++++++++++++++
 7 files changed, 1153 insertions(+)
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_quan_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_quan_nvrtc_float.cpp

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 438e41c88..94fc6cc9b 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -66,6 +66,13 @@ run test_fisher_f_pdf_nvrtc_float.cpp ;
 run test_fisher_f_quan_nvrtc_double.cpp ;
 run test_fisher_f_quan_nvrtc_float.cpp ;
 
+run test_gamma_dist_cdf_nvrtc_double.cpp ;
+run test_gamma_dist_cdf_nvrtc_float.cpp ;
+run test_gamma_dist_pdf_nvrtc_double.cpp ;
+run test_gamma_dist_pdf_nvrtc_float.cpp ;
+run test_gamma_dist_quan_nvrtc_double.cpp ;
+run test_gamma_dist_quan_nvrtc_float.cpp ;
+
 run test_holtsmark_cdf_nvrtc_double.cpp ;
 run test_holtsmark_cdf_nvrtc_float.cpp ;
 run test_holtsmark_pdf_nvrtc_double.cpp ;
diff --git a/test/test_gamma_dist_cdf_nvrtc_double.cpp b/test/test_gamma_dist_cdf_nvrtc_double.cpp
new file mode 100644
index 000000000..3e911f4e0
--- /dev/null
+++ b/test/test_gamma_dist_cdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
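+            // Only in1 is consumed by this kernel; the second buffer is filled anyway,
+            // presumably to keep the launch signature uniform across these tests.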
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_gamma_dist_cdf_nvrtc_float.cpp b/test/test_gamma_dist_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..17762d406
--- /dev/null
+++ b/test/test_gamma_dist_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
diff --git a/test/test_gamma_dist_cdf_nvrtc_float.cpp b/test/test_gamma_dist_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..17762d406
--- /dev/null
+++ b/test/test_gamma_dist_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
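One driver-API detail worth noting in these tests: `cuLaunchKernel` takes its kernel arguments as an array of pointers to the arguments, one slot per kernel parameter in declaration order, which is why the unused second input still occupies a slot. Annotated from the code above:

    // One slot per kernel parameter, each holding the address of the argument:
    // (const float_type* in1, const float_type* in2, float_type* out, int numElements)
    void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
    cuLaunchKernel(kernel,
                   numBlocks, 1, 1,  // grid dimensions
                   blockSize, 1, 1,  // block dimensions
                   0,                // dynamic shared memory (bytes)
                   0,                // stream (0 = default stream)
                   args,             // kernel parameters
                   0);               // extra options (unused)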
diff --git a/test/test_gamma_dist_pdf_nvrtc_double.cpp b/test/test_gamma_dist_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..1faae9986
--- /dev/null
+++ b/test/test_gamma_dist_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
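Every test sizes its grid with the usual ceiling division so the grid covers all elements even when `numElements` is not a multiple of the block size; the `if (i < numElements)` guard in the kernel discards the overshoot. With the values used here:

    int numElements = 5000;
    int blockSize   = 256;
    int numBlocks   = (numElements + blockSize - 1) / blockSize;
    // numBlocks == (5000 + 255) / 256 == 20, so 20 * 256 = 5120 threads are
    // launched and the final 120 fail the bounds check and do nothing.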
diff --git a/test/test_gamma_dist_pdf_nvrtc_float.cpp b/test/test_gamma_dist_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..054ddbbad
--- /dev/null
+++ b/test/test_gamma_dist_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
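The host-side check recomputes each value serially and compares with `boost::math::epsilon_difference` (from `<boost/math/special_functions/relative_difference.hpp>`), which reports the gap between two values in units of machine epsilon, so the `> 300` threshold flags device results more than roughly 300 epsilon from the serial ones. A small host-only sketch with illustrative numbers (not taken from the tests):

    #include <boost/math/special_functions/relative_difference.hpp>
    #include <iostream>

    int main()
    {
        double serial   = 0.63212055882855767;      // cdf of gamma(1, 1) at 1, i.e. 1 - exp(-1)
        double parallel = serial * (1.0 + 1.0e-13); // pretend the device drifted slightly
        // Prints the distance in multiples of machine epsilon (roughly 450 here)
        std::cout << boost::math::epsilon_difference(serial, parallel) << '\n';
    }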
diff --git a/test/test_gamma_dist_quan_nvrtc_double.cpp b/test/test_gamma_dist_quan_nvrtc_double.cpp
new file mode 100644
index 000000000..132efcd6c
--- /dev/null
+++ b/test/test_gamma_dist_quan_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_gamma_dist_quan_nvrtc_float.cpp b/test/test_gamma_dist_quan_nvrtc_float.cpp
new file mode 100644
index 000000000..7749523ab
--- /dev/null
+++ b/test/test_gamma_dist_quan_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From 75e1710bfbbf81b60a929809c9ceba6cb59d4acd Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:59:27 -0400
Subject: [PATCH 16/31] Reduce number of threads per block since it can crash CI

---
 test/test_arcsine_cdf_double.cu | 2 +-
 test/test_arcsine_cdf_float.cu | 2 +-
 test/test_arcsine_pdf_double.cu | 2 +-
 test/test_arcsine_pdf_float.cu | 2 +-
 test/test_arcsine_quan_double.cu | 2 +-
 test/test_arcsine_quan_float.cu | 2 +-
 test/test_arcsine_range_support_double.cu | 2 +-
 test/test_arcsine_range_support_float.cu | 2 +-
 test/test_bernoulli_cdf_double.cu | 2 +-
 test/test_bernoulli_cdf_float.cu | 2 +-
 test/test_bernoulli_pdf_double.cu | 2 +-
 test/test_bernoulli_pdf_float.cu | 2 +-
 test/test_bernoulli_range_support_double.cu | 2 +-
 test/test_bernoulli_range_support_float.cu | 2 +-
 test/test_beta_dist_cdf_double.cu | 2 +-
 test/test_beta_dist_cdf_float.cu | 2 +-
 test/test_beta_dist_pdf_double.cu | 2 +-
 test/test_beta_dist_pdf_float.cu | 2 +-
 test/test_beta_dist_quan_double.cu | 2 +-
 test/test_beta_dist_quan_float.cu | 2 +-
 test/test_cauchy_cdf_double.cu | 2 +-
 test/test_cauchy_cdf_float.cu | 2 +-
 test/test_cauchy_pdf_double.cu | 2 +-
 test/test_cauchy_pdf_float.cu | 2 +-
 test/test_cauchy_quan_double.cu | 2 +-
 test/test_cauchy_quan_float.cu | 2 +-
 test/test_cauchy_range_support_double.cu | 2 +-
 test/test_cauchy_range_support_float.cu | 2 +-
 test/test_chi_squared_cdf_double.cu | 2 +-
 test/test_chi_squared_cdf_float.cu | 2 +-
 test/test_chi_squared_pdf_double.cu | 2 +-
 test/test_chi_squared_pdf_float.cu | 2 +-
 test/test_chi_squared_quan_double.cu | 2 +-
 test/test_chi_squared_quan_float.cu | 2 +-
 test/test_exponential_cdf_double.cu | 2 +-
 test/test_exponential_cdf_float.cu | 2 +-
 test/test_exponential_pdf_double.cu | 2 +-
 test/test_exponential_pdf_float.cu | 2 +-
 test/test_exponential_quan_double.cu | 2 +-
 test/test_exponential_quan_float.cu | 2 +-
 test/test_exponential_range_support_double.cu | 2 +-
 test/test_exponential_range_support_float.cu | 2 +-
 test/test_extreme_value_cdf_double.cu | 2 +-
 test/test_extreme_value_cdf_float.cu | 2 +-
 test/test_extreme_value_pdf_double.cu | 2 +-
 test/test_extreme_value_pdf_float.cu | 2 +-
 test/test_extreme_value_quan_double.cu | 2 +-
 test/test_extreme_value_quan_float.cu | 2 +-
 test/test_fisher_f_cdf_double.cu | 2 +-
 test/test_fisher_f_cdf_float.cu | 2 +-
 test/test_fisher_f_pdf_double.cu | 2 +-
 test/test_fisher_f_pdf_float.cu | 2 +-
 test/test_fisher_f_quan_double.cu | 2 +-
 test/test_fisher_f_quan_float.cu | 2 +-
 test/test_gamma_dist_cdf_double.cu | 2 +-
 test/test_gamma_dist_cdf_float.cu | 2 +-
 test/test_gamma_dist_pdf_double.cu | 2 +-
 test/test_gamma_dist_pdf_float.cu | 2 +-
 test/test_gamma_dist_quan_double.cu | 2 +-
 test/test_gamma_dist_quan_float.cu | 2 +-
 test/test_holtsmark_cdf_double.cu | 2 +-
 test/test_holtsmark_cdf_float.cu | 2 +-
 test/test_holtsmark_pdf_double.cu | 2 +-
 test/test_holtsmark_pdf_float.cu | 2 +-
 test/test_landau_cdf_double.cu | 2 +-
 test/test_landau_cdf_float.cu | 2 +-
 test/test_landau_pdf_double.cu | 2 +-
 test/test_landau_pdf_float.cu | 2 +-
 test/test_landau_quan_double.cu | 2 +-
 test/test_landau_quan_float.cu | 2 +-
 test/test_laplace_cdf_double.cu | 2 +-
 test/test_laplace_cdf_float.cu | 2 +-
 test/test_laplace_pdf_double.cu | 2 +-
 test/test_laplace_pdf_float.cu | 2 +-
 test/test_laplace_quan_double.cu | 2 +-
 test/test_laplace_quan_float.cu | 2 +-
 test/test_logistic_cdf_double.cu | 2 +-
 test/test_logistic_cdf_float.cu | 2 +-
 test/test_logistic_pdf_double.cu | 2 +-
 test/test_logistic_pdf_float.cu | 2 +-
 test/test_logistic_quan_double.cu | 2 +-
 test/test_logistic_quan_float.cu | 2 +-
 test/test_mapairy_cdf_double.cu | 2 +-
 test/test_mapairy_cdf_float.cu | 2 +-
 test/test_mapairy_pdf_double.cu | 2 +-
 test/test_mapairy_pdf_float.cu | 2 +-
 test/test_mapairy_quan_double.cu | 2 +-
 test/test_mapairy_quan_float.cu | 2 +-
 test/test_saspoint5_cdf_double.cu | 2 +-
 test/test_saspoint5_cdf_float.cu | 2 +-
 test/test_saspoint5_pdf_double.cu | 2 +-
 test/test_saspoint5_pdf_float.cu | 2 +-
 test/test_saspoint5_quan_double.cu | 2 +-
 test/test_saspoint5_quan_float.cu | 2 +-
 test/test_weibull_cdf_double.cu | 2 +-
 test/test_weibull_cdf_float.cu | 2 +-
 test/test_weibull_pdf_double.cu | 2 +-
 test/test_weibull_pdf_float.cu | 2 +-
 test/test_weibull_quan_double.cu | 2 +-
 test/test_weibull_quan_float.cu | 2 +-
 100 files changed, 100 insertions(+), 100 deletions(-)

diff --git a/test/test_arcsine_cdf_double.cu b/test/test_arcsine_cdf_double.cu
index d6f6f7b35..3ac9e22cd 100644
--- a/test/test_arcsine_cdf_double.cu
+++ b/test/test_arcsine_cdf_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_arcsine_cdf_float.cu b/test/test_arcsine_cdf_float.cu
index 148b1dffb..cc73ce95b 100644
--- a/test/test_arcsine_cdf_float.cu
+++ b/test/test_arcsine_cdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int
threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_double.cu b/test/test_arcsine_pdf_double.cu index 7a73bb34e..8f45017ba 100644 --- a/test/test_arcsine_pdf_double.cu +++ b/test/test_arcsine_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_float.cu b/test/test_arcsine_pdf_float.cu index 54a11253c..c236b7876 100644 --- a/test/test_arcsine_pdf_float.cu +++ b/test/test_arcsine_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_double.cu b/test/test_arcsine_quan_double.cu index 31f6eac8a..a45737063 100644 --- a/test/test_arcsine_quan_double.cu +++ b/test/test_arcsine_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_float.cu b/test/test_arcsine_quan_float.cu index 6decb347b..fd8cd11fc 100644 --- a/test/test_arcsine_quan_float.cu +++ b/test/test_arcsine_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_double.cu b/test/test_arcsine_range_support_double.cu index cec919a1a..b3fb575fa 100644 --- a/test/test_arcsine_range_support_double.cu +++ b/test/test_arcsine_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_float.cu b/test/test_arcsine_range_support_float.cu index d397e0c86..d207d0598 100644 --- a/test/test_arcsine_range_support_float.cu +++ b/test/test_arcsine_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_double.cu b/test/test_bernoulli_cdf_double.cu index e4c21ca06..1a6dce645 100644 --- a/test/test_bernoulli_cdf_double.cu +++ 
b/test/test_bernoulli_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_float.cu b/test/test_bernoulli_cdf_float.cu index 82c0eabc0..998f24736 100644 --- a/test/test_bernoulli_cdf_float.cu +++ b/test/test_bernoulli_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_double.cu b/test/test_bernoulli_pdf_double.cu index 24b33c16c..147e2f340 100644 --- a/test/test_bernoulli_pdf_double.cu +++ b/test/test_bernoulli_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_float.cu b/test/test_bernoulli_pdf_float.cu index 08d2ca5a0..49eaea32f 100644 --- a/test/test_bernoulli_pdf_float.cu +++ b/test/test_bernoulli_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_double.cu b/test/test_bernoulli_range_support_double.cu index 86c77bd11..ade952fca 100644 --- a/test/test_bernoulli_range_support_double.cu +++ b/test/test_bernoulli_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_float.cu b/test/test_bernoulli_range_support_float.cu index cdcf54418..ef276b938 100644 --- a/test/test_bernoulli_range_support_float.cu +++ b/test/test_bernoulli_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_cdf_double.cu b/test/test_beta_dist_cdf_double.cu index fa460244a..9188f4305 100644 --- a/test/test_beta_dist_cdf_double.cu +++ b/test/test_beta_dist_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << 
std::endl; diff --git a/test/test_beta_dist_cdf_float.cu b/test/test_beta_dist_cdf_float.cu index 321c84420..0278f6415 100644 --- a/test/test_beta_dist_cdf_float.cu +++ b/test/test_beta_dist_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_double.cu b/test/test_beta_dist_pdf_double.cu index c0ee9272a..e86cf94dd 100644 --- a/test/test_beta_dist_pdf_double.cu +++ b/test/test_beta_dist_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_float.cu b/test/test_beta_dist_pdf_float.cu index 75e4fa27b..97dd606f2 100644 --- a/test/test_beta_dist_pdf_float.cu +++ b/test/test_beta_dist_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_double.cu b/test/test_beta_dist_quan_double.cu index 101526afa..a6b842e8e 100644 --- a/test/test_beta_dist_quan_double.cu +++ b/test/test_beta_dist_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_float.cu b/test/test_beta_dist_quan_float.cu index 77696c639..48a860f4c 100644 --- a/test/test_beta_dist_quan_float.cu +++ b/test/test_beta_dist_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_cdf_double.cu b/test/test_cauchy_cdf_double.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_double.cu +++ b/test/test_cauchy_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_cdf_float.cu b/test/test_cauchy_cdf_float.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_float.cu +++ b/test/test_cauchy_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << 
blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_double.cu b/test/test_cauchy_pdf_double.cu index 7a7fe5ba6..62398c31e 100644 --- a/test/test_cauchy_pdf_double.cu +++ b/test/test_cauchy_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_float.cu b/test/test_cauchy_pdf_float.cu index 5ec3b604b..aff3369b8 100644 --- a/test/test_cauchy_pdf_float.cu +++ b/test/test_cauchy_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_double.cu b/test/test_cauchy_quan_double.cu index 21f4b4dda..0fcaaafe7 100644 --- a/test/test_cauchy_quan_double.cu +++ b/test/test_cauchy_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_float.cu b/test/test_cauchy_quan_float.cu index b6bed1520..9c04c5b12 100644 --- a/test/test_cauchy_quan_float.cu +++ b/test/test_cauchy_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_double.cu b/test/test_cauchy_range_support_double.cu index 4ec792ce3..3a42c1bd3 100644 --- a/test/test_cauchy_range_support_double.cu +++ b/test/test_cauchy_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_float.cu b/test/test_cauchy_range_support_float.cu index 1cdd90e40..e713736e6 100644 --- a/test/test_cauchy_range_support_float.cu +++ b/test/test_cauchy_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_double.cu b/test/test_chi_squared_cdf_double.cu index 1b0c34ce6..c2475883b 100644 --- a/test/test_chi_squared_cdf_double.cu +++ b/test/test_chi_squared_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int 
blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_float.cu b/test/test_chi_squared_cdf_float.cu index 8ca99ed2e..07dce0d06 100644 --- a/test/test_chi_squared_cdf_float.cu +++ b/test/test_chi_squared_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_double.cu b/test/test_chi_squared_pdf_double.cu index ed45246d3..30edafd05 100644 --- a/test/test_chi_squared_pdf_double.cu +++ b/test/test_chi_squared_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_float.cu b/test/test_chi_squared_pdf_float.cu index 5a0f97db9..9b205182b 100644 --- a/test/test_chi_squared_pdf_float.cu +++ b/test/test_chi_squared_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_double.cu b/test/test_chi_squared_quan_double.cu index 3b7dad972..3fae7d966 100644 --- a/test/test_chi_squared_quan_double.cu +++ b/test/test_chi_squared_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_float.cu b/test/test_chi_squared_quan_float.cu index 3e779a090..7a717530e 100644 --- a/test/test_chi_squared_quan_float.cu +++ b/test/test_chi_squared_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_double.cu b/test/test_exponential_cdf_double.cu index 8601d1c08..e3a57e86e 100644 --- a/test/test_exponential_cdf_double.cu +++ b/test/test_exponential_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_float.cu b/test/test_exponential_cdf_float.cu index aa5ef9153..ed214a495 100644 --- a/test/test_exponential_cdf_float.cu +++ 
b/test/test_exponential_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_double.cu b/test/test_exponential_pdf_double.cu index 9a5615f1b..530b1023b 100644 --- a/test/test_exponential_pdf_double.cu +++ b/test/test_exponential_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_float.cu b/test/test_exponential_pdf_float.cu index f15ee3ea8..0801e2d0b 100644 --- a/test/test_exponential_pdf_float.cu +++ b/test/test_exponential_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_double.cu b/test/test_exponential_quan_double.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_double.cu +++ b/test/test_exponential_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_float.cu b/test/test_exponential_quan_float.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_float.cu +++ b/test/test_exponential_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_double.cu b/test/test_exponential_range_support_double.cu index eec3981d2..c19497ed5 100644 --- a/test/test_exponential_range_support_double.cu +++ b/test/test_exponential_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_float.cu b/test/test_exponential_range_support_float.cu index 00f443e52..a111090de 100644 --- a/test/test_exponential_range_support_float.cu +++ b/test/test_exponential_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid 
<< " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_double.cu b/test/test_extreme_value_cdf_double.cu index 8f7f366b3..7ca000348 100644 --- a/test/test_extreme_value_cdf_double.cu +++ b/test/test_extreme_value_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_float.cu b/test/test_extreme_value_cdf_float.cu index d1b6cc762..bc3ead6eb 100644 --- a/test/test_extreme_value_cdf_float.cu +++ b/test/test_extreme_value_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_double.cu b/test/test_extreme_value_pdf_double.cu index 4cf3fc2d0..44ccc5b71 100644 --- a/test/test_extreme_value_pdf_double.cu +++ b/test/test_extreme_value_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_float.cu b/test/test_extreme_value_pdf_float.cu index c0c5da7ee..390622f40 100644 --- a/test/test_extreme_value_pdf_float.cu +++ b/test/test_extreme_value_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_double.cu b/test/test_extreme_value_quan_double.cu index 703d2054f..41f2f69a6 100644 --- a/test/test_extreme_value_quan_double.cu +++ b/test/test_extreme_value_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_float.cu b/test/test_extreme_value_quan_float.cu index 25d982cd0..5fe16e9a8 100644 --- a/test/test_extreme_value_quan_float.cu +++ b/test/test_extreme_value_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_cdf_double.cu b/test/test_fisher_f_cdf_double.cu index 877961166..c6d6f0a94 100644 --- a/test/test_fisher_f_cdf_double.cu +++ b/test/test_fisher_f_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - 
int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_cdf_float.cu b/test/test_fisher_f_cdf_float.cu index a6fcc9f98..9df1bc869 100644 --- a/test/test_fisher_f_cdf_float.cu +++ b/test/test_fisher_f_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_pdf_double.cu b/test/test_fisher_f_pdf_double.cu index e4ae50791..77a3b655a 100644 --- a/test/test_fisher_f_pdf_double.cu +++ b/test/test_fisher_f_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_pdf_float.cu b/test/test_fisher_f_pdf_float.cu index 7b7583736..323edf342 100644 --- a/test/test_fisher_f_pdf_float.cu +++ b/test/test_fisher_f_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_quan_double.cu b/test/test_fisher_f_quan_double.cu index 42bcb0dac..c16eb2a95 100644 --- a/test/test_fisher_f_quan_double.cu +++ b/test/test_fisher_f_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_quan_float.cu b/test/test_fisher_f_quan_float.cu index 3a0bc688b..85cf47967 100644 --- a/test/test_fisher_f_quan_float.cu +++ b/test/test_fisher_f_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_cdf_double.cu b/test/test_gamma_dist_cdf_double.cu index 6424850c3..4777196aa 100644 --- a/test/test_gamma_dist_cdf_double.cu +++ b/test/test_gamma_dist_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_cdf_float.cu b/test/test_gamma_dist_cdf_float.cu index 4f2312ccc..a93aca395 100644 --- a/test/test_gamma_dist_cdf_float.cu +++ 
b/test/test_gamma_dist_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_pdf_double.cu b/test/test_gamma_dist_pdf_double.cu index 2f8bbc5f4..a8411d5b6 100644 --- a/test/test_gamma_dist_pdf_double.cu +++ b/test/test_gamma_dist_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_pdf_float.cu b/test/test_gamma_dist_pdf_float.cu index 2080f5ccf..6ab3247ac 100644 --- a/test/test_gamma_dist_pdf_float.cu +++ b/test/test_gamma_dist_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_quan_double.cu b/test/test_gamma_dist_quan_double.cu index bde18fc36..d29bf6d6b 100644 --- a/test/test_gamma_dist_quan_double.cu +++ b/test/test_gamma_dist_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_quan_float.cu b/test/test_gamma_dist_quan_float.cu index 01ce85dfd..58aa42e90 100644 --- a/test/test_gamma_dist_quan_float.cu +++ b/test/test_gamma_dist_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_cdf_double.cu b/test/test_holtsmark_cdf_double.cu index 5a02b7ddb..6b1d57041 100644 --- a/test/test_holtsmark_cdf_double.cu +++ b/test/test_holtsmark_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_cdf_float.cu b/test/test_holtsmark_cdf_float.cu index 71ae21f73..2a3533bac 100644 --- a/test/test_holtsmark_cdf_float.cu +++ b/test/test_holtsmark_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_pdf_double.cu 
b/test/test_holtsmark_pdf_double.cu index b883515a7..a53360d20 100644 --- a/test/test_holtsmark_pdf_double.cu +++ b/test/test_holtsmark_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_pdf_float.cu b/test/test_holtsmark_pdf_float.cu index c56815973..57052803f 100644 --- a/test/test_holtsmark_pdf_float.cu +++ b/test/test_holtsmark_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_cdf_double.cu b/test/test_landau_cdf_double.cu index 092fff00e..40bff707d 100644 --- a/test/test_landau_cdf_double.cu +++ b/test/test_landau_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_cdf_float.cu b/test/test_landau_cdf_float.cu index 143755aff..c4513c084 100644 --- a/test/test_landau_cdf_float.cu +++ b/test/test_landau_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_pdf_double.cu b/test/test_landau_pdf_double.cu index eea6f87ad..6ce3f5f78 100644 --- a/test/test_landau_pdf_double.cu +++ b/test/test_landau_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_pdf_float.cu b/test/test_landau_pdf_float.cu index a424bdd67..5818ddf8a 100644 --- a/test/test_landau_pdf_float.cu +++ b/test/test_landau_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_quan_double.cu b/test/test_landau_quan_double.cu index 8cdf12588..4995bd49c 100644 --- a/test/test_landau_quan_double.cu +++ b/test/test_landau_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git 
a/test/test_landau_quan_float.cu b/test/test_landau_quan_float.cu index 8cdf12588..4995bd49c 100644 --- a/test/test_landau_quan_float.cu +++ b/test/test_landau_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_cdf_double.cu b/test/test_laplace_cdf_double.cu index cddcfa2bc..ec3c83ecd 100644 --- a/test/test_laplace_cdf_double.cu +++ b/test/test_laplace_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_cdf_float.cu b/test/test_laplace_cdf_float.cu index 2af43f9f5..96acea2fd 100644 --- a/test/test_laplace_cdf_float.cu +++ b/test/test_laplace_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_pdf_double.cu b/test/test_laplace_pdf_double.cu index 2f53c0dd1..568be622b 100644 --- a/test/test_laplace_pdf_double.cu +++ b/test/test_laplace_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_pdf_float.cu b/test/test_laplace_pdf_float.cu index a8d673dba..cb2aa67c1 100644 --- a/test/test_laplace_pdf_float.cu +++ b/test/test_laplace_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_quan_double.cu b/test/test_laplace_quan_double.cu index cddcfa2bc..ec3c83ecd 100644 --- a/test/test_laplace_quan_double.cu +++ b/test/test_laplace_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_quan_float.cu b/test/test_laplace_quan_float.cu index 2af43f9f5..96acea2fd 100644 --- a/test/test_laplace_quan_float.cu +++ b/test/test_laplace_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << 
" threads" << std::endl; diff --git a/test/test_logistic_cdf_double.cu b/test/test_logistic_cdf_double.cu index 5dd3723c5..6b4e85025 100644 --- a/test/test_logistic_cdf_double.cu +++ b/test/test_logistic_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_cdf_float.cu b/test/test_logistic_cdf_float.cu index 89d05747b..75b6ab0af 100644 --- a/test/test_logistic_cdf_float.cu +++ b/test/test_logistic_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_pdf_double.cu b/test/test_logistic_pdf_double.cu index 39aaa1597..90232a2d6 100644 --- a/test/test_logistic_pdf_double.cu +++ b/test/test_logistic_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_pdf_float.cu b/test/test_logistic_pdf_float.cu index 279112b99..0a99ff9cf 100644 --- a/test/test_logistic_pdf_float.cu +++ b/test/test_logistic_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_quan_double.cu b/test/test_logistic_quan_double.cu index ad929d442..afe8a4c8c 100644 --- a/test/test_logistic_quan_double.cu +++ b/test/test_logistic_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_quan_float.cu b/test/test_logistic_quan_float.cu index 81c22ea4b..92c371062 100644 --- a/test/test_logistic_quan_float.cu +++ b/test/test_logistic_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_cdf_double.cu b/test/test_mapairy_cdf_double.cu index 1494181bf..7cb62a934 100644 --- a/test/test_mapairy_cdf_double.cu +++ b/test/test_mapairy_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch 
with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_cdf_float.cu b/test/test_mapairy_cdf_float.cu index 41dd4615a..b67c0ee93 100644 --- a/test/test_mapairy_cdf_float.cu +++ b/test/test_mapairy_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_pdf_double.cu b/test/test_mapairy_pdf_double.cu index ad3abfbee..4ccd8b2f2 100644 --- a/test/test_mapairy_pdf_double.cu +++ b/test/test_mapairy_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_pdf_float.cu b/test/test_mapairy_pdf_float.cu index cabee4a2f..520ac9a68 100644 --- a/test/test_mapairy_pdf_float.cu +++ b/test/test_mapairy_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_quan_double.cu b/test/test_mapairy_quan_double.cu index fe6265eff..378700020 100644 --- a/test/test_mapairy_quan_double.cu +++ b/test/test_mapairy_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_quan_float.cu b/test/test_mapairy_quan_float.cu index ad2f6b5eb..cd9d12007 100644 --- a/test/test_mapairy_quan_float.cu +++ b/test/test_mapairy_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_cdf_double.cu b/test/test_saspoint5_cdf_double.cu index 745ca2bf8..fb3e2f74c 100644 --- a/test/test_saspoint5_cdf_double.cu +++ b/test/test_saspoint5_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_cdf_float.cu b/test/test_saspoint5_cdf_float.cu index 51bc2e870..325a470bb 100644 --- a/test/test_saspoint5_cdf_float.cu +++ b/test/test_saspoint5_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) 
/ threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_pdf_double.cu b/test/test_saspoint5_pdf_double.cu index 948a09260..5392a328b 100644 --- a/test/test_saspoint5_pdf_double.cu +++ b/test/test_saspoint5_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_pdf_float.cu b/test/test_saspoint5_pdf_float.cu index 4980e9070..01fbcd472 100644 --- a/test/test_saspoint5_pdf_float.cu +++ b/test/test_saspoint5_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_quan_double.cu b/test/test_saspoint5_quan_double.cu index 764c27899..7415f0690 100644 --- a/test/test_saspoint5_quan_double.cu +++ b/test/test_saspoint5_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_quan_float.cu b/test/test_saspoint5_quan_float.cu index a65958fb8..d6f49084b 100644 --- a/test/test_saspoint5_quan_float.cu +++ b/test/test_saspoint5_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_weibull_cdf_double.cu b/test/test_weibull_cdf_double.cu index 65efbe252..1b2e5cf0d 100644 --- a/test/test_weibull_cdf_double.cu +++ b/test/test_weibull_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_weibull_cdf_float.cu b/test/test_weibull_cdf_float.cu index 65c3ce1ff..76bf3a4e1 100644 --- a/test/test_weibull_cdf_float.cu +++ b/test/test_weibull_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_weibull_pdf_double.cu b/test/test_weibull_pdf_double.cu index 645df4c0a..dd48b57d6 100644 --- a/test/test_weibull_pdf_double.cu +++ b/test/test_weibull_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int 
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_weibull_pdf_float.cu b/test/test_weibull_pdf_float.cu
index f1e6917f0..40064b1ed 100644
--- a/test/test_weibull_pdf_float.cu
+++ b/test/test_weibull_pdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }

     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_weibull_quan_double.cu b/test/test_weibull_quan_double.cu
index 2f0500602..9263fb536 100644
--- a/test/test_weibull_quan_double.cu
+++ b/test/test_weibull_quan_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }

     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_weibull_quan_float.cu b/test/test_weibull_quan_float.cu
index 3027e14dd..5dd6bd6ee 100644
--- a/test/test_weibull_quan_float.cu
+++ b/test/test_weibull_quan_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }

     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

From f9b91acf6354b0965625b23d8bfa3d0128e2ed8f Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:39:34 -0400
Subject: [PATCH 17/31] Add GPU support to the geometric dist

---
 .../boost/math/distributions/geometric.hpp | 91 +++++++++----------
 1 file changed, 44 insertions(+), 47 deletions(-)

diff --git a/include/boost/math/distributions/geometric.hpp b/include/boost/math/distributions/geometric.hpp
index 7c511ef2d..8aa78ddc9 100644
--- a/include/boost/math/distributions/geometric.hpp
+++ b/include/boost/math/distributions/geometric.hpp
@@ -36,6 +36,9 @@
 #ifndef BOOST_MATH_SPECIAL_GEOMETRIC_HPP
 #define BOOST_MATH_SPECIAL_GEOMETRIC_HPP

+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/tools/tuple.hpp>
 #include <boost/math/distributions/fwd.hpp>
 #include <boost/math/special_functions/beta.hpp> // for ibeta(a, b, x) == Ix(a, b).
 #include <boost/math/distributions/complement.hpp> // complement.
@@ -45,10 +48,6 @@
 #include <boost/math/special_functions/fpclassify.hpp>
 #include <boost/math/tools/roots.hpp>
 #include <boost/math/distributions/detail/inv_discrete_quantile.hpp>
-#include <limits> // using std::numeric_limits;
-#include <utility>
-#include <cstdint>

 #if defined (BOOST_MSVC)
 # pragma warning(push)
 // This believed not now necessary, so commented out.
@@ -64,7 +63,7 @@ namespace boost
 {
      // Common error checking routines for geometric distribution function:
      template <class RealType, class Policy>
-     inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
      {
        if( !(boost::math::isfinite)(p) || (p < 0) || (p > 1) )
        {
@@ -77,13 +76,13 @@ namespace boost
      }

      template <class RealType, class Policy>
-     inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
      {
        return check_success_fraction(function, p, result, pol);
      }

      template <class RealType, class Policy>
-     inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
      {
        if(check_dist(function, p, result, pol) == false)
        {
@@ -100,7 +99,7 @@ namespace boost
      } // Check_dist_and_k

      template <class RealType, class Policy>
-     inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
      {
        if((check_dist(function, p, result, pol) && detail::check_probability(function, prob, result, pol)) == false)
        {
@@ -117,7 +116,7 @@
      typedef RealType value_type;
      typedef Policy policy_type;

-     geometric_distribution(RealType p) : m_p(p)
+     BOOST_MATH_GPU_ENABLED geometric_distribution(RealType p) : m_p(p)
      { // Constructor stores success_fraction p.
        RealType result;
        geometric_detail::check_dist(
@@ -127,22 +126,22 @@ namespace boost
      } // geometric_distribution constructor.

      // Private data getter class member functions.
-     RealType success_fraction() const
+     BOOST_MATH_GPU_ENABLED RealType success_fraction() const
      { // Probability of success as fraction in range 0 to 1.
        return m_p;
      }
-     RealType successes() const
+     BOOST_MATH_GPU_ENABLED RealType successes() const
      { // Total number of successes r = 1 (for compatibility with negative binomial?).
        return 1;
      }

      // Parameter estimation.
      // (These are copies of negative_binomial distribution with successes = 1).
-     static RealType find_lower_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_lower_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -163,11 +162,11 @@ namespace boost
        return ibeta_inv(successes, failures + 1, alpha, static_cast<RealType*>(nullptr), Policy());
      } // find_lower_bound_on_p

-     static RealType find_upper_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_upper_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -195,12 +194,12 @@ namespace boost

      // Estimate number of trials :
      // "How many trials do I need to be P% sure of seeing k or fewer failures?"
-     static RealType find_minimum_number_of_trials(
+     BOOST_MATH_GPU_ENABLED static RealType find_minimum_number_of_trials(
        RealType k,     // number of failures (k >= 0).
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -218,7 +217,7 @@
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -244,22 +243,22 @@
 #endif

   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
  { // Range of permissible values for random variable k.
     using boost::math::tools::max_value;
-    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+    return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
  }

   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
  { // Range of supported values for random variable k.
     // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
     using boost::math::tools::max_value;
-    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+    return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
  }

   template <class RealType, class Policy>
-  inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
  { // Mean of geometric distribution = (1-p)/p.
    return (1 - dist.success_fraction() ) / dist.success_fraction();
  } // mean

  // median implemented via quantile(half) in derived accessors.

   template <class RealType, class Policy>
-  inline RealType mode(const geometric_distribution<RealType, Policy>&)
+  BOOST_MATH_GPU_ENABLED inline RealType mode(const geometric_distribution<RealType, Policy>&)
  { // Mode of geometric distribution = zero.
    BOOST_MATH_STD_USING // ADL of std functions.
    return 0;
  } // mode

   template <class RealType, class Policy>
-  inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
  { // Variance of Binomial distribution = (1-p) / p^2.
    return (1 - dist.success_fraction()) / (dist.success_fraction() * dist.success_fraction());
  } // variance

   template <class RealType, class Policy>
-  inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
  { // skewness of geometric distribution = 2-p / (sqrt(r(1-p))
    BOOST_MATH_STD_USING // ADL of std functions.
    RealType p = dist.success_fraction();
@@ -289,7 +288,7 @@ namespace boost
  } // skewness

   template <class RealType, class Policy>
-  inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
  { // kurtosis of geometric distribution
    // http://en.wikipedia.org/wiki/geometric is kurtosis_excess so add 3
    RealType p = dist.success_fraction();
@@ -297,7 +296,7 @@ namespace boost
  } // kurtosis

   template <class RealType, class Policy>
-  inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
  { // kurtosis excess of geometric distribution
    // http://mathworld.wolfram.com/Kurtosis.html table of kurtosis_excess
    RealType p = dist.success_fraction();
@@ -312,11 +311,11 @@ namespace boost
  // chf of geometric distribution provided by derived accessors.

   template <class RealType, class Policy>
-  inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
  { // Probability Density/Mass Function.
    BOOST_FPU_EXCEPTION_GUARD
    BOOST_MATH_STD_USING  // For ADL of math functions.
-   static const char* function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
    RealType p = dist.success_fraction();

    RealType result = 0;
@@ -350,9 +349,9 @@ namespace boost
  } // geometric_pdf

   template <class RealType, class Policy>
-  inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
  { // Cumulative Distribution Function of geometric.
-   static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";

    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
@@ -381,12 +380,10 @@ namespace boost
  } // cdf Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
  { // Cumulative Distribution Function of geometric.
-   using std::pow;
-   using std::log;
-   using std::exp;
-   static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+   BOOST_MATH_STD_USING
+   constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";

    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
@@ -399,7 +396,7 @@ namespace boost
        k,
        &result, Policy()))
    {
-     return -std::numeric_limits<RealType>::infinity();
+     return -boost::math::numeric_limits<RealType>::infinity();
    }
    if(k == 0)
    {
@@ -413,10 +410,10 @@ namespace boost
  } // logcdf Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
  { // Complemented Cumulative Distribution Function geometric.
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
    RealType const& k = c.param;
@@ -438,10 +435,10 @@ namespace boost
  } // cdf Complemented Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
  { // Complemented Cumulative Distribution Function geometric.
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
    RealType const& k = c.param;
@@ -455,21 +452,21 @@ namespace boost
        k,
        &result, Policy()))
    {
-     return -std::numeric_limits<RealType>::infinity();
+     return -boost::math::numeric_limits<RealType>::infinity();
    }

    return boost::math::log1p(-p, Policy()) * (k+1);
  } // logcdf Complemented Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
  { // Quantile, percentile/100 or Percent Point geometric function.
    // Return the number of expected failures k for a given probability p.

    // Inverse cumulative Distribution Function or Quantile (percentile / 100) of geometric Probability.
    // k argument may be integral, signed, or unsigned, or floating point.

-   static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
    BOOST_MATH_STD_USING // ADL of std functions.

    RealType success_fraction = dist.success_fraction();
@@ -513,11 +510,11 @@ namespace boost
  } // RealType quantile(const geometric_distribution dist, p)

   template <class RealType, class Policy>
-  inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
  { // Quantile or Percent Point Binomial function.
    // Return the number of expected failures k for a given
    // complement of the probability Q = 1 - P.
-   static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
    BOOST_MATH_STD_USING
    // Error checks:
    RealType x = c.param;

From 91cffdafb694327508351dfe366e6cca742f77eb Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:39:49 -0400
Subject: [PATCH 18/31] Add SYCL testing of geometric dist

---
 test/sycl_jamfile       |  1 +
 test/test_geometric.cpp | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 03b130268..c9527a8c3 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -19,6 +19,7 @@ run test_exponential_dist.cpp ;
 run test_extreme_value.cpp ;
 run test_fisher_f.cpp ;
 run test_gamma_dist.cpp ;
+run test_geometric.cpp ;
 run test_holtsmark.cpp ;
 run test_landau.cpp ;
 run test_laplace.cpp ;
diff --git a/test/test_geometric.cpp b/test/test_geometric.cpp
index 928a2aa0e..13a9e090b 100644
--- a/test/test_geometric.cpp
+++ b/test/test_geometric.cpp
@@ -26,9 +26,14 @@
 # define TEST_REAL_CONCEPT
 #endif

-#include <boost/math/tools/test.hpp>
+#include <boost/math/tools/config.hpp>
+
+#include "../include_private/boost/math/tools/test.hpp"
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp> // for real_concept
 using ::boost::math::concepts::real_concept;
+#endif

 #include <boost/math/distributions/geometric.hpp> // for geometric_distribution
 using boost::math::geometric_distribution;
@@ -64,7 +69,11 @@ void test_spot( // Test a single spot value against 'known good' values.
       RealType tol,    // Test tolerance
       RealType logtol) // Logcdf Test tolerance.
 {
-  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value || std::is_same<RealType, real_concept>::value)
+  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value
+  #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+  || std::is_same<RealType, real_concept>::value
+  #endif
+  )
   {
     logtol *= 100;
   }
@@ -376,7 +385,9 @@ if(std::numeric_limits<RealType>::is_specialized)
       static_cast<RealType>(9.9000000000003448e-201L), //
       100 * tolerance); // Note difference

-    // p nearer unity.
+    // p nearer unity.
+    // On GPU this gets flushed to 0 which has an eps difference of 3.4e+38
+    #ifndef BOOST_MATH_HAS_GPU_SUPPORT
     BOOST_CHECK_CLOSE_FRACTION( //
       pdf(geometric_distribution<RealType>(static_cast<RealType>(0.9999)), static_cast<RealType>(10) ),  // Number of failures, k
       // static_cast<RealType>(1.00156406e-040)
       static_cast<RealType>(9.999e-41), // exact from 100 digit calculator.
       2e3 * tolerance); // Note bigger tolerance needed.
+    #endif

     // Moshier Cephes 100 digits calculator says 9.999e-41
     //0.9999*pow(1-0.9999,10)

From e6518f5bac271a130a8ca6cf0ca2e35009ab1fdc Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:59:25 -0400
Subject: [PATCH 19/31] Add cuda::std::tie

---
 include/boost/math/tools/tuple.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp
index 82d23b8d7..dcc763e37 100644
--- a/include/boost/math/tools/tuple.hpp
+++ b/include/boost/math/tools/tuple.hpp
@@ -23,6 +23,7 @@
 using cuda::std::tuple;

 using cuda::std::make_pair;
+using cuda::std::tie;
 using cuda::std::get;

 using cuda::std::tuple_size;

From 4609f25f2709fddd20078992a06ce8972bddf32d Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:59:40 -0400
Subject: [PATCH 20/31] Add GPU support to inv_discrete_quantile

---
 .../detail/inv_discrete_quantile.hpp | 71 ++++++++++---------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
index 739a86666..f688345b7 100644
--- a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
+++ b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
@@ -6,7 +6,11 @@
 #ifndef BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE
 #define BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE

-#include <algorithm>
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/cstdint.hpp>
+#include <boost/math/tools/tuple.hpp>
+#include <boost/math/tools/toms748_solve.hpp>
+#include <boost/math/policies/error_handling.hpp>

 namespace boost{ namespace math{ namespace detail{

@@ -19,10 +23,10 @@ struct distribution_quantile_finder
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;

-   distribution_quantile_finder(const Dist d, value_type p, bool c)
+   BOOST_MATH_GPU_ENABLED distribution_quantile_finder(const Dist d, value_type p, bool c)
       : dist(d), target(p), comp(c) {}

-   value_type operator()(value_type const& x)
+   BOOST_MATH_GPU_ENABLED value_type operator()(value_type const& x)
    {
       return comp ? value_type(target - cdf(complement(dist, x))) : value_type(cdf(dist, x) - target);
    }
@@ -42,24 +46,24 @@ struct distribution_quantile_finder
 // in the root no longer being bracketed.
 //
 template <class Real, class Tol>
-void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}

 template <class Real>
-void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
 {
    BOOST_MATH_STD_USING
    b -= tools::epsilon<Real>() * b;
 }

 template <class Real>
-void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
 }

 template <class Real>
-void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
@@ -69,7 +73,7 @@ void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol
 // This is where all the work is done:
 //
 template <class Dist, class Tolerance>
-typename Dist::value_type
+BOOST_MATH_GPU_ENABLED typename Dist::value_type
    do_inverse_discrete_quantile(
       const Dist& dist,
      const typename Dist::value_type& p,
@@ -78,7 +82,7 @@ typename Dist::value_type
       const typename Dist::value_type& multiplier,
       typename Dist::value_type adder,
       const Tolerance& tol,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;
@@ -100,7 +104,7 @@ typename Dist::value_type
       guess = min_bound;

    value_type fa = f(guess);
-   std::uintmax_t count = max_iter - 1;
+   boost::math::uintmax_t count = max_iter - 1;
    value_type fb(fa), a(guess), b =0; // Compiler warning C4701: potentially uninitialized local variable 'b' used

    if(fa == 0)
@@ -130,7 +134,7 @@ typename Dist::value_type
       else
       {
          b = a;
-         a = (std::max)(value_type(b - 1), value_type(0));
+         a = BOOST_MATH_GPU_SAFE_MAX(value_type(b - 1), value_type(0));
         if(a < min_bound)
            a = min_bound;
         fa = f(a);
@@ -153,7 +157,7 @@ typename Dist::value_type
       // If we're looking for a large result, then bump "adder" up
       // by a bit to increase our chances of bracketing the root:
       //
-      //adder = (std::max)(adder, 0.001f * guess);
+      //adder = BOOST_MATH_GPU_SAFE_MAX(adder, 0.001f * guess);
       if(fa < 0)
       {
          b = a + adder;
@@ -162,7 +166,7 @@ typename Dist::value_type
       }
       else
       {
-         b = (std::max)(value_type(a - adder), value_type(0));
+         b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
         if(b < min_bound)
            b = min_bound;
      }
@@ -186,7 +190,7 @@ typename Dist::value_type
          }
          else
          {
-            b = (std::max)(value_type(a - adder), value_type(0));
+            b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
            if(b < min_bound)
               b = min_bound;
         }
@@ -195,9 +199,8 @@ typename Dist::value_type
      }
      if(a > b)
      {
-        using std::swap;
-        swap(a, b);
-        swap(fa, fb);
+        BOOST_MATH_GPU_SAFE_SWAP(a, b);
+        BOOST_MATH_GPU_SAFE_SWAP(fa, fb);
      }
   }
@@ -274,7 +277,7 @@ typename Dist::value_type
   //
   // Go ahead and find the root:
   //
-  std::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
+  boost::math::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
   max_iter += count;
   if (max_iter >= policies::get_max_root_iterations<policy_type>())
  {
@@ -293,7 +296,7 @@ typename Dist::value_type
 // is very close 1.
 //
 template <class Dist>
-inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = ceil(result);
@@ -325,7 +328,7 @@ inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::va
 #endif

 template <class Dist>
-inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = floor(result);
@@ -339,7 +342,11 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
    //
    while(true)
    {
+      #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+      cc = ceil(nextafter(result, tools::max_value<typename Dist::value_type>()));
+      #else
       cc = ceil(float_next(result));
+      #endif
       if(cc > support(d).second)
         break;
      pp = c ? cdf(complement(d, cc)) : cdf(d, cc);
@@ -362,7 +369,7 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
 // to an int where required.
 //
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       typename Dist::value_type p,
@@ -371,7 +378,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::real>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    if(p > 0.5)
    {
@@ -393,7 +400,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -402,7 +409,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_outwards>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -436,7 +443,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -445,7 +452,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_inwards>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -479,7 +486,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -488,7 +495,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_down>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -507,7 +514,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -516,7 +523,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_up>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type pp = c ? 1 - p : p;
@@ -534,7 +541,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -543,7 +550,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_nearest>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING

From ed0b3a088956c27c94d64d94b74835c09ff9d64b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:59:52 -0400
Subject: [PATCH 21/31] Add CUDA testing of geometric dist

---
 test/cuda_jamfile                        |   7 ++
 test/test_geometric_dist_cdf_double.cu  | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_cdf_float.cu   | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_pdf_double.cu  | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_pdf_float.cu   | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_quan_double.cu | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_quan_float.cu  | 109 ++++++++++++++++++++++++
 7 files changed, 661 insertions(+)
 create mode 100644 test/test_geometric_dist_cdf_double.cu
 create mode 100644 test/test_geometric_dist_cdf_float.cu
 create mode 100644 test/test_geometric_dist_pdf_double.cu
 create mode 100644 test/test_geometric_dist_pdf_float.cu
 create mode 100644 test/test_geometric_dist_quan_double.cu
 create mode 100644 test/test_geometric_dist_quan_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index b01aa8bb1..57a16f2c7 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -79,6 +79,13 @@ run test_gamma_dist_pdf_float.cu ;
 run test_gamma_dist_quan_double.cu ;
 run test_gamma_dist_quan_float.cu ;

+run test_geometric_dist_cdf_double.cu ;
+run test_geometric_dist_cdf_float.cu ;
+run test_geometric_dist_pdf_double.cu ;
+run test_geometric_dist_pdf_float.cu ;
+run test_geometric_dist_quan_double.cu ;
+run test_geometric_dist_quan_float.cu ;
+
 run test_holtsmark_cdf_double.cu ;
 run test_holtsmark_cdf_float.cu ;
 run test_holtsmark_pdf_double.cu ;
diff --git a/test/test_geometric_dist_cdf_double.cu b/test/test_geometric_dist_cdf_double.cu
new file mode 100644
index 000000000..98b6510ad
--- /dev/null
+++ b/test/test_geometric_dist_cdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.  (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_cdf_float.cu b/test/test_geometric_dist_cdf_float.cu
new file mode 100644
index 000000000..2662ac07c
--- /dev/null
+++ b/test/test_geometric_dist_cdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_double.cu b/test/test_geometric_dist_pdf_double.cu
new file mode 100644
index 000000000..03d2dc007
--- /dev/null
+++ b/test/test_geometric_dist_pdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_float.cu b/test/test_geometric_dist_pdf_float.cu
new file mode 100644
index 000000000..1034d122b
--- /dev/null
+++ b/test/test_geometric_dist_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_double.cu b/test/test_geometric_dist_quan_double.cu
new file mode 100644
index 000000000..fcac938e5
--- /dev/null
+++ b/test/test_geometric_dist_quan_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_float.cu b/test/test_geometric_dist_quan_float.cu
new file mode 100644
index 000000000..89d8bea47
--- /dev/null
+++ b/test/test_geometric_dist_quan_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file From bf31592c135856a4b8a6e60d93fd5ef6c68415e6 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:16:01 -0400 Subject: [PATCH 22/31] Add NVRTC testing of geometric dist --- .../boost/math/distributions/geometric.hpp | 2 +- test/nvrtc_jamfile | 7 + test/test_geometric_dist_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++ .../test_geometric_dist_quan_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_quan_nvrtc_float.cpp | 191 ++++++++++++++++++ 8 files changed, 1154 insertions(+), 1 deletion(-) create mode 100644 test/test_geometric_dist_cdf_nvrtc_double.cpp create mode 100644 test/test_geometric_dist_cdf_nvrtc_float.cpp create mode 100644 test/test_geometric_dist_pdf_nvrtc_double.cpp create mode 100644 test/test_geometric_dist_pdf_nvrtc_float.cpp create mode 100644 test/test_geometric_dist_quan_nvrtc_double.cpp create mode 100644 test/test_geometric_dist_quan_nvrtc_float.cpp diff --git a/include/boost/math/distributions/geometric.hpp b/include/boost/math/distributions/geometric.hpp index 8aa78ddc9..0a7b383c2 100644 --- a/include/boost/math/distributions/geometric.hpp +++ b/include/boost/math/distributions/geometric.hpp @@ -212,7 +212,7 @@ namespace boost return result + k; } // RealType find_number_of_failures - static RealType find_maximum_number_of_trials( + BOOST_MATH_GPU_ENABLED static RealType find_maximum_number_of_trials( RealType k, // number of failures (k >= 0). RealType p, // success fraction 0 <= p <= 1. RealType alpha) // risk level threshold 0 <= alpha <= 1. diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 94fc6cc9b..cf3fe89e9 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -73,6 +73,13 @@ run test_gamma_dist_pdf_nvrtc_float.cpp ; run test_gamma_dist_quan_nvrtc_double.cpp ; run test_gamma_dist_quan_nvrtc_float.cpp ; +run test_geometric_dist_cdf_nvrtc_double.cpp ; +run test_geometric_dist_cdf_nvrtc_float.cpp ; +run test_geometric_dist_pdf_nvrtc_double.cpp ; +run test_geometric_dist_pdf_nvrtc_float.cpp ; +run test_geometric_dist_quan_nvrtc_double.cpp ; +run test_geometric_dist_quan_nvrtc_float.cpp ; + run test_holtsmark_cdf_nvrtc_double.cpp ; run test_holtsmark_cdf_nvrtc_float.cpp ; run test_holtsmark_pdf_nvrtc_double.cpp ; diff --git a/test/test_geometric_dist_cdf_nvrtc_double.cpp b/test/test_geometric_dist_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..f8c5ed5aa --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_cdf_nvrtc_float.cpp b/test/test_geometric_dist_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..a53cd0d97 --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_pdf_nvrtc_double.cpp b/test/test_geometric_dist_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..8a6b5756e --- /dev/null +++ b/test/test_geometric_dist_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_pdf_nvrtc_float.cpp b/test/test_geometric_dist_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..dfb05105d --- /dev/null +++ b/test/test_geometric_dist_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_quan_nvrtc_double.cpp b/test/test_geometric_dist_quan_nvrtc_double.cpp new file mode 100644 index 000000000..52b2e97ec --- /dev/null +++ b/test/test_geometric_dist_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_quan_nvrtc_float.cpp b/test/test_geometric_dist_quan_nvrtc_float.cpp new file mode 100644 index 000000000..a83cf857e --- /dev/null +++ b/test/test_geometric_dist_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From ca4bb46d25e7376614721606bdfcdc843f6c442b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:29:32 -0400 Subject: [PATCH 23/31] Add SYCL testing of inverse_chi_squared dist --- .../distributions/inverse_chi_squared.hpp | 68 +++++++++---------- test/sycl_jamfile | 1 + .../test_inverse_chi_squared_distribution.cpp | 7 +- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/include/boost/math/distributions/inverse_chi_squared.hpp b/include/boost/math/distributions/inverse_chi_squared.hpp index 19dd0371e..1a3c680d2 100644 --- a/include/boost/math/distributions/inverse_chi_squared.hpp +++ b/include/boost/math/distributions/inverse_chi_squared.hpp @@ -1,6 +1,6 @@ // Copyright John Maddock 2010. // Copyright Paul A. Bristow 2010. - +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -9,6 +9,8 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP #define BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP +#include +#include #include #include // for incomplete beta. #include // for complements. @@ -24,14 +26,12 @@ // Weisstein, Eric W. "Inverse Chi-Squared Distribution." From MathWorld--A Wolfram Web Resource. // http://mathworld.wolfram.com/InverseChi-SquaredDistribution.html -#include - namespace boost{ namespace math{ namespace detail { template - inline bool check_inverse_chi_squared( // Check both distribution parameters. 
+  BOOST_MATH_GPU_ENABLED inline bool check_inverse_chi_squared( // Check both distribution parameters.
       const char* function,
       RealType degrees_of_freedom, // degrees_of_freedom (aka nu).
       RealType scale, // scale (aka sigma^2)
@@ -51,7 +51,7 @@ class inverse_chi_squared_distribution
    typedef RealType value_type;
    typedef Policy policy_type;
 
-   inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale)
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale)
    {
       RealType result;
       detail::check_df(
@@ -62,7 +62,7 @@ class inverse_chi_squared_distribution
          m_scale, &result, Policy());
    } // inverse_chi_squared_distribution constructor
 
-   inverse_chi_squared_distribution(RealType df = 1) : m_df(df)
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1) : m_df(df)
    {
       RealType result;
       m_scale = 1 / m_df ; // Default scale = 1 / degrees of freedom (Wikipedia definition 1).
@@ -71,11 +71,11 @@ class inverse_chi_squared_distribution
         m_df, &result, Policy());
    } // inverse_chi_squared_distribution
 
-   RealType degrees_of_freedom()const
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const
    {
      return m_df; // aka nu
    }
-   RealType scale()const
+   BOOST_MATH_GPU_ENABLED RealType scale()const
    {
      return m_scale; // aka xi
    }
@@ -105,28 +105,28 @@ inverse_chi_squared_distribution(RealType,RealType)->inverse_chi_squared_distrib
 #endif
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> range(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
 { // Range of permissible values for random variable x.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // 0 to + infinity.
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // 0 to + infinity.
 }
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> support(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
 { // Range of supported values for random variable x.
   // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), tools::max_value<RealType>()); // 0 to + infinity.
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), tools::max_value<RealType>()); // 0 to + infinity.
 }
 
 template <class RealType, class Policy>
-RealType pdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED RealType pdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING  // for ADL of std functions.
   RealType df = dist.degrees_of_freedom();
   RealType scale = dist.scale();
   RealType error_result;

-  static const char* function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
+  constexpr auto function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)";

   if(false == detail::check_inverse_chi_squared
     (function, df, scale, &error_result, Policy())
@@ -159,9 +159,9 @@ RealType pdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, con
 } // pdf

 template <class RealType, class Policy>
-inline RealType cdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
 {
-   static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
    RealType df = dist.degrees_of_freedom();
    RealType scale = dist.scale();
    RealType error_result;
@@ -188,13 +188,13 @@ inline RealType cdf(const inverse_chi_squared_distribution<RealType, Policy>& di
 } // cdf

 template <class RealType, class Policy>
-inline RealType quantile(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& p)
 {
    using boost::math::gamma_q_inv;
    RealType df = dist.degrees_of_freedom();
    RealType scale = dist.scale();

-   static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
    // Error check:
    RealType error_result;
    if(false == detail::check_df(
@@ -220,13 +220,13 @@ inline RealType quantile(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
 {
    using boost::math::gamma_q_inv;
    RealType const& df = c.dist.degrees_of_freedom();
    RealType const& scale = c.dist.scale();
    RealType const& x = c.param;
-   static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
    // Error check:
    RealType error_result;
    if(false == detail::check_df(
@@ -251,14 +251,14 @@ inline RealType cdf(const complemented2_type
 }

 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
 {
    using boost::math::gamma_q_inv;
    RealType const& df = c.dist.degrees_of_freedom();
    RealType const& scale = c.dist.scale();
    RealType const& q = c.param;

-   static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
    // Error check:
    RealType error_result;
    if(false == detail::check_df(function, df, &error_result, Policy()))
@@ -280,12 +280,12 @@ inline RealType quantile(const complemented2_type
 }

 template <class RealType, class Policy>
-inline RealType mean(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 { // Mean of inverse Chi-Squared distribution.
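   // Reference: for this parameterisation the closed form is
   //    mean = nu * xi / (nu - 2)   for nu > 2,
   // which is why the domain check below rejects df <= 2.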
   RealType df = dist.degrees_of_freedom();
   RealType scale = dist.scale();
-   static const char* function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)";
   if(df <= 2)
      return policies::raise_domain_error<RealType>(
         function,
@@ -295,11 +295,11 @@ inline RealType mean(const inverse_chi_squared_distribution<RealType, Policy>& d
 } // mean

 template <class RealType, class Policy>
-inline RealType variance(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 { // Variance of inverse Chi-Squared distribution.
    RealType df = dist.degrees_of_freedom();
    RealType scale = dist.scale();
-   static const char* function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 4)
    {
       return policies::raise_domain_error<RealType>(
@@ -311,14 +311,14 @@ inline RealType variance(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType mode(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 { // mode is not defined in Mathematica.
   // See Discussion section http://en.wikipedia.org/wiki/Talk:Scaled-inverse-chi-square_distribution
   // for origin of the formula used below.

   RealType df = dist.degrees_of_freedom();
   RealType scale = dist.scale();
-   static const char* function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)";
   if(df < 0)
      return policies::raise_domain_error<RealType>(
         function,
@@ -341,11 +341,11 @@ inline RealType mode(const inverse_chi_squared_distribution<RealType, Policy>& d
 // Now implemented via quantile(half) in derived accessors.
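// Reference for the shape functions that follow, in the same parameterisation
// (nu = degrees of freedom); the df guards match these validity ranges:
//
//    skewness        = 4 / (nu - 6) * sqrt(2 * (nu - 4))            for nu > 6
//    kurtosis_excess = 12 * (5 * nu - 22) / ((nu - 6) * (nu - 8))   for nu > 8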
template <class RealType, class Policy>
-inline RealType skewness(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // For ADL
    RealType df = dist.degrees_of_freedom();
-   static const char* function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 6)
       return policies::raise_domain_error<RealType>(
          function,
@@ -356,10 +356,10 @@ inline RealType skewness(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType kurtosis(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 {
    RealType df = dist.degrees_of_freedom();
-   static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 8)
       return policies::raise_domain_error<RealType>(
          function,
@@ -370,10 +370,10 @@ inline RealType kurtosis(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 {
    RealType df = dist.degrees_of_freedom();
-   static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis_excess(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 8)
       return policies::raise_domain_error<RealType>(
          function,
diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index c9527a8c3..e90dc0e70 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -21,6 +21,7 @@ run test_fisher_f.cpp ;
 run test_gamma_dist.cpp ;
 run test_geometric.cpp ;
 run test_holtsmark.cpp ;
+run test_inverse_chi_squared_distribution.cpp ;
 run test_landau.cpp ;
 run test_laplace.cpp ;
 run test_logistic_dist.cpp ;
diff --git a/test/test_inverse_chi_squared_distribution.cpp b/test/test_inverse_chi_squared_distribution.cpp
index a69782418..cbc9dcf19 100644
--- a/test/test_inverse_chi_squared_distribution.cpp
+++ b/test/test_inverse_chi_squared_distribution.cpp
@@ -14,11 +14,14 @@

 // http://www.wolframalpha.com/input/?i=inverse+chisquare+distribution

-#include
+#include
+#include "../include_private/boost/math/tools/test.hpp"
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include // for real_concept
 using ::boost::math::concepts::real_concept;
+#endif

-//#include
 #define BOOST_TEST_MAIN
 #include // for test_main
 #include // for BOOST_CHECK_CLOSE_FRACTION
From 5a7e304409fb87dc907360e03c011fe35cd70784 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 14:36:44 -0400
Subject: [PATCH 24/31] Adjust tol

---
 test/test_geometric_dist_quan_float.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_geometric_dist_quan_float.cu b/test/test_geometric_dist_quan_float.cu
index 89d8bea47..074952202 100644
--- a/test/test_geometric_dist_quan_float.cu
+++ b/test/test_geometric_dist_quan_float.cu
@@ -90,7 +90,7 @@ int main(void)
     // check the results
     for(int i = 0; i < numElements; ++i)
     {
-        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200.0)
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000.0)
         {
             std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; From 8dd1e81f0e0ccb6e373fdd15e2f4749b874e34f4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:44:29 -0400 Subject: [PATCH 25/31] Add NVRTC inverse chi squared dist testing --- test/nvrtc_jamfile | 7 + ...t_inverse_chi_squared_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++ ...st_inverse_chi_squared_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++ ...t_inverse_chi_squared_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++ ...st_inverse_chi_squared_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++ ..._inverse_chi_squared_quan_nvrtc_double.cpp | 191 ++++++++++++++++++ ...t_inverse_chi_squared_quan_nvrtc_float.cpp | 191 ++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_double.cpp create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_float.cpp create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_double.cpp create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_float.cpp create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_double.cpp create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index cf3fe89e9..0834086d4 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -87,6 +87,13 @@ run test_holtsmark_pdf_nvrtc_float.cpp ; run test_holtsmark_quan_nvrtc_double.cpp ; run test_holtsmark_quan_nvrtc_float.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_quan_nvrtc_double.cpp ; +run test_inverse_chi_squared_quan_nvrtc_float.cpp ; + run test_landau_cdf_nvrtc_double.cpp ; run test_landau_cdf_nvrtc_float.cpp ; run test_landau_pdf_nvrtc_double.cpp ; diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..b221aedaa --- /dev/null +++ b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..743654c14 --- /dev/null +++ b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..4608b3bd6 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..8b4db55c0 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_double.cpp b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp new file mode 100644 index 000000000..0f8a9a5f8 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i 
< numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_float.cpp b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp new file mode 100644 index 000000000..ab494a8da --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From bab0138bfcb08b9513d9c58eced6134a2268b802 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:53:29 -0400 Subject: [PATCH 26/31] Add CUDA inverse chi squared dist testing --- test/cuda_jamfile | 7 ++ test/test_inverse_chi_squared_cdf_double.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_cdf_float.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_pdf_double.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_pdf_float.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_quan_double.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_quan_float.cu | 110 +++++++++++++++++++ 7 files changed, 667 insertions(+) create mode 100644 test/test_inverse_chi_squared_cdf_double.cu create mode 100644 test/test_inverse_chi_squared_cdf_float.cu create mode 100644 test/test_inverse_chi_squared_pdf_double.cu create mode 100644 test/test_inverse_chi_squared_pdf_float.cu create mode 100644 test/test_inverse_chi_squared_quan_double.cu create mode 100644 test/test_inverse_chi_squared_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 57a16f2c7..283267593 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -91,6 +91,13 @@ run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; run test_holtsmark_pdf_float.cu ; +run test_inverse_chi_squared_cdf_double.cu ; +run test_inverse_chi_squared_cdf_float.cu ; +run test_inverse_chi_squared_pdf_double.cu ; +run test_inverse_chi_squared_pdf_float.cu ; +run test_inverse_chi_squared_quan_double.cu ; +run test_inverse_chi_squared_quan_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; diff --git a/test/test_inverse_chi_squared_cdf_double.cu b/test/test_inverse_chi_squared_cdf_double.cu new file mode 100644 index 000000000..9703e7a3a --- /dev/null +++ b/test/test_inverse_chi_squared_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_cdf_float.cu b/test/test_inverse_chi_squared_cdf_float.cu
new file mode 100644
index 000000000..bb56a4872
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_pdf_double.cu b/test/test_inverse_chi_squared_pdf_double.cu
new file mode 100644
index 000000000..f30611749
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(pdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_pdf_float.cu b/test/test_inverse_chi_squared_pdf_float.cu
new file mode 100644
index 000000000..8a3d1c1ef
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_quan_double.cu b/test/test_inverse_chi_squared_quan_double.cu
new file mode 100644
index 000000000..f9022c6a3
--- /dev/null
+++ b/test/test_inverse_chi_squared_quan_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_quan_float.cu b/test/test_inverse_chi_squared_quan_float.cu
new file mode 100644
index 000000000..10aa6d707
--- /dev/null
+++ b/test/test_inverse_chi_squared_quan_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} From 62ac8cda1cab598bfd17a9a225d9ab4837d44a17 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:29:51 -0400 Subject: [PATCH 27/31] Add GPU support to inverse gamma dist --- .../math/distributions/inverse_gamma.hpp | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/include/boost/math/distributions/inverse_gamma.hpp b/include/boost/math/distributions/inverse_gamma.hpp index 8c9e4763d..6aa798ed8 100644 --- a/include/boost/math/distributions/inverse_gamma.hpp +++ b/include/boost/math/distributions/inverse_gamma.hpp @@ -2,6 +2,7 @@ // Copyright Paul A. Bristow 2010. // Copyright John Maddock 2010. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -22,21 +23,21 @@ // http://mathworld.wolfram.com/GammaDistribution.html // http://en.wikipedia.org/wiki/Gamma_distribution +#include +#include +#include #include #include #include #include -#include -#include - namespace boost{ namespace math { namespace detail { template -inline bool check_inverse_gamma_shape( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_shape( const char* function, // inverse_gamma RealType shape, // shape aka alpha RealType* result, // to update, perhaps with NaN @@ -57,7 +58,7 @@ inline bool check_inverse_gamma_shape( } //bool check_inverse_gamma_shape template -inline bool check_inverse_gamma_x( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -73,7 +74,7 @@ inline bool check_inverse_gamma_x( } template -inline bool check_inverse_gamma( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma( const char* function, // TODO swap these over, so shape is first. RealType scale, // scale aka beta RealType shape, // shape aka alpha @@ -92,7 +93,7 @@ class inverse_gamma_distribution using value_type = RealType; using policy_type = Policy; - explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) + BOOST_MATH_GPU_ENABLED explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; @@ -101,12 +102,12 @@ class inverse_gamma_distribution l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_MATH_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_MATH_GPU_ENABLED RealType scale()const { return m_scale; } @@ -132,27 +133,27 @@ inverse_gamma_distribution(RealType,RealType)->inverse_gamma_distribution -inline std::pair range(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair range(const inverse_gamma_distribution& /* dist */) { // Range of permissible values for random variable x. 
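+   // boost::math::pair stands in for std::pair here and in support() below:
+   // device code cannot in general call into the host standard library, so
+   // this patch series routes such utilities through GPU-safe equivalents.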
using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline std::pair support(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair support(const inverse_gamma_distribution& /* dist */) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. using boost::math::tools::max_value; using boost::math::tools::min_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -195,17 +196,17 @@ inline RealType pdf(const inverse_gamma_distribution& dist, co } // pdf template -inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::lgamma; - static const char* function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); - RealType result = -std::numeric_limits::infinity(); + RealType result = -boost::math::numeric_limits::infinity(); if(false == detail::check_inverse_gamma(function, scale, shape, &result, Policy())) { // distribution parameters bad. 
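+      // Depending on the user's Policy we reach this point with result set
+      // to the policy's error value (the default policy throws instead).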
return result; @@ -232,11 +233,11 @@ inline RealType logpdf(const inverse_gamma_distribution& dist, } // pdf template -inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -260,12 +261,12 @@ inline RealType cdf(const inverse_gamma_distribution& dist, co } // cdf template -inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::gamma_q_inv; - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -287,11 +288,11 @@ inline RealType quantile(const inverse_gamma_distribution& dis } template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -310,11 +311,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -338,11 +339,11 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -365,11 +366,11 @@ inline RealType mean(const inverse_gamma_distribution& dist) } // mean template -inline RealType variance(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -391,11 +392,11 @@ inline RealType variance(const inverse_gamma_distribution& dis } template -inline RealType mode(const inverse_gamma_distribution& 
dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -418,11 +419,11 @@ inline RealType mode(const inverse_gamma_distribution& dist) //} template -inline RealType skewness(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -444,11 +445,11 @@ inline RealType skewness(const inverse_gamma_distribution& dis } template -inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -470,9 +471,9 @@ inline RealType kurtosis_excess(const inverse_gamma_distribution -inline RealType kurtosis(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_gamma_distribution& dist) { - static const char* function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); From 2f05b01088cfe7f18be9874c59b1f9c61d4ef5b3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:30:05 -0400 Subject: [PATCH 28/31] Add SYCL testing to inverse gamma dist --- test/sycl_jamfile | 1 + test/test_inverse_gamma_distribution.cpp | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index e90dc0e70..bcdfb7e5d 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -22,6 +22,7 @@ run test_gamma_dist.cpp ; run test_geometric.cpp ; run test_holtsmark.cpp ; run test_inverse_chi_squared_distribution.cpp ; +run test_inverse_gamma_distribution.cpp ; run test_landau.cpp ; run test_laplace.cpp ; run test_logistic_dist.cpp ; diff --git a/test/test_inverse_gamma_distribution.cpp b/test/test_inverse_gamma_distribution.cpp index 68b238fbc..436131d83 100644 --- a/test/test_inverse_gamma_distribution.cpp +++ b/test/test_inverse_gamma_distribution.cpp @@ -14,11 +14,14 @@ # pragma warning (disable : 4310) // cast truncates constant value #endif -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT #include // for real_concept using ::boost::math::concepts::real_concept; +#endif -//#include #define BOOST_TEST_MAIN #include // for test_main #include // for BOOST_CHECK_CLOSE_FRACTION From c95d73ceaac5ad6838ab68ce9b0e02b67fc0a07f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:43:00 -0400 Subject: [PATCH 29/31] Add NVRTC testing of inverse gamma dist --- 
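Note: each of the six tests added below embeds its kernel as an NVRTC source
string and JIT-compiles it at runtime. As orientation, a minimal sketch of the
kernel they compile is shown here (the cdf variant; the pdf and quantile tests
swap in the matching call). The unnamed second in-pointer is unused by the
kernel but matches the four-argument launch used by the host code, and the
default-constructed distribution has shape = scale = 1.

    typedef double float_type;
    #include <boost/math/distributions/inverse_gamma.hpp>

    extern "C" __global__
    void test_inverse_gamma_kernel(const float_type* in1, const float_type*, float_type* out, int numElements)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements)
        {
            // Default-constructed distribution: shape = 1, scale = 1
            out[i] = cdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
        }
    }
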
test/nvrtc_jamfile | 7 + test/test_inverse_gamma_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_quan_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_quan_nvrtc_float.cpp | 191 ++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_inverse_gamma_cdf_nvrtc_double.cpp create mode 100644 test/test_inverse_gamma_cdf_nvrtc_float.cpp create mode 100644 test/test_inverse_gamma_pdf_nvrtc_double.cpp create mode 100644 test/test_inverse_gamma_pdf_nvrtc_float.cpp create mode 100644 test/test_inverse_gamma_quan_nvrtc_double.cpp create mode 100644 test/test_inverse_gamma_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 0834086d4..1b001eb2f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -94,6 +94,13 @@ run test_inverse_chi_squared_pdf_nvrtc_float.cpp ; run test_inverse_chi_squared_quan_nvrtc_double.cpp ; run test_inverse_chi_squared_quan_nvrtc_float.cpp ; +run test_inverse_gamma_cdf_nvrtc_double.cpp ; +run test_inverse_gamma_cdf_nvrtc_float.cpp ; +run test_inverse_gamma_pdf_nvrtc_double.cpp ; +run test_inverse_gamma_pdf_nvrtc_float.cpp ; +run test_inverse_gamma_quan_nvrtc_double.cpp ; +run test_inverse_gamma_quan_nvrtc_float.cpp ; + run test_landau_cdf_nvrtc_double.cpp ; run test_landau_cdf_nvrtc_float.cpp ; run test_landau_pdf_nvrtc_double.cpp ; diff --git a/test/test_inverse_gamma_cdf_nvrtc_double.cpp b/test/test_inverse_gamma_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..c5a4b9878 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] 
= static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_cdf_nvrtc_float.cpp b/test/test_inverse_gamma_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..d76d51225 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_pdf_nvrtc_double.cpp b/test/test_inverse_gamma_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..db2c8c4e1 --- /dev/null +++ b/test/test_inverse_gamma_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] 
= static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_pdf_nvrtc_float.cpp b/test/test_inverse_gamma_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..4d552cf61 --- /dev/null +++ b/test/test_inverse_gamma_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_quan_nvrtc_double.cpp b/test/test_inverse_gamma_quan_nvrtc_double.cpp new file mode 100644 index 000000000..a49600bde --- /dev/null +++ b/test/test_inverse_gamma_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + 
h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_quan_nvrtc_float.cpp b/test/test_inverse_gamma_quan_nvrtc_float.cpp new file mode 100644 index 000000000..f71ed964a --- /dev/null +++ b/test/test_inverse_gamma_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + 
h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 44f82e1a39c029b30d4e742d7df89e3de68d05a9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:52:54 -0400 Subject: [PATCH 30/31] Add CUDA testing of inverse gamma dist --- test/cuda_jamfile | 7 ++ test/test_inverse_gamma_cdf_double.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_cdf_float.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_pdf_double.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_pdf_float.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_quan_double.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_quan_float.cu | 110 +++++++++++++++++++++++++ 7 files changed, 667 insertions(+) create mode 100644 test/test_inverse_gamma_cdf_double.cu create mode 100644 test/test_inverse_gamma_cdf_float.cu create mode 100644 test/test_inverse_gamma_pdf_double.cu create mode 100644 test/test_inverse_gamma_pdf_float.cu create mode 100644 test/test_inverse_gamma_quan_double.cu create mode 100644 test/test_inverse_gamma_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 283267593..796d14a49 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -98,6 +98,13 @@ run test_inverse_chi_squared_pdf_float.cu ; run test_inverse_chi_squared_quan_double.cu ; run test_inverse_chi_squared_quan_float.cu ; +run test_inverse_gamma_cdf_double.cu ; +run test_inverse_gamma_cdf_float.cu ; +run test_inverse_gamma_pdf_double.cu ; +run test_inverse_gamma_pdf_float.cu ; +run test_inverse_gamma_quan_double.cu ; +run test_inverse_gamma_quan_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; diff --git a/test/test_inverse_gamma_cdf_double.cu b/test/test_inverse_gamma_cdf_double.cu new file mode 100644 index 000000000..4368a2284 --- /dev/null +++ b/test/test_inverse_gamma_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_cdf_float.cu b/test/test_inverse_gamma_cdf_float.cu
new file mode 100644
index 000000000..cef2ec955
--- /dev/null
+++ b/test/test_inverse_gamma_cdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_double.cu b/test/test_inverse_gamma_pdf_double.cu
new file mode 100644
index 000000000..fa5073dbe
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_float.cu b/test/test_inverse_gamma_pdf_float.cu
new file mode 100644
index 000000000..c2d80fe8d
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_quan_double.cu b/test/test_inverse_gamma_quan_double.cu
new file mode 100644
index 000000000..c9095d752
--- /dev/null
+++ b/test/test_inverse_gamma_quan_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_quan_float.cu b/test/test_inverse_gamma_quan_float.cu
new file mode 100644
index 000000000..3e60feaa1
--- /dev/null
+++ b/test/test_inverse_gamma_quan_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} From 89e1707f4ddf86b6b9c5fa8e214a6fa5a2d542bf Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:57:34 -0400 Subject: [PATCH 31/31] Update docs --- doc/distributions/fisher.qbk | 14 ++++++++------ doc/distributions/gamma.qbk | 14 ++++++++------ doc/distributions/geometric.qbk | 16 +++++++++------- doc/distributions/inverse_chi_squared.qbk | 18 ++++++++++-------- doc/distributions/inverse_gamma.qbk | 14 ++++++++------ 5 files changed, 43 insertions(+), 33 deletions(-) diff --git a/doc/distributions/fisher.qbk b/doc/distributions/fisher.qbk index 80c9a9b29..9b3a55f59 100644 --- a/doc/distributions/fisher.qbk +++ b/doc/distributions/fisher.qbk @@ -17,11 +17,11 @@ typedef RealType value_type; // Construct: - fisher_f_distribution(const RealType& i, const RealType& j); + BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& i, const RealType& j); // Accessors: - RealType degrees_of_freedom1()const; - RealType degrees_of_freedom2()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const; }; }} //namespaces @@ -46,7 +46,7 @@ two degrees of freedom parameters. [h4 Member Functions] - fisher_f_distribution(const RealType& df1, const RealType& df2); + BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& df1, const RealType& df2); Constructs an F-distribution with numerator degrees of freedom /df1/ and denominator degrees of freedom /df2/. @@ -54,11 +54,11 @@ and denominator degrees of freedom /df2/. Requires that /df1/ and /df2/ are both greater than zero, otherwise __domain_error is called. - RealType degrees_of_freedom1()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const; Returns the numerator degrees of freedom parameter of the distribution. - RealType degrees_of_freedom2()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const; Returns the denominator degrees of freedom parameter of the distribution. @@ -66,6 +66,8 @@ Returns the denominator degrees of freedom parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0, +[infin]\]. diff --git a/doc/distributions/gamma.qbk b/doc/distributions/gamma.qbk index eefcc84a0..5f9f0c2bf 100644 --- a/doc/distributions/gamma.qbk +++ b/doc/distributions/gamma.qbk @@ -12,10 +12,10 @@ typedef RealType value_type; typedef Policy policy_type; - gamma_distribution(RealType shape, RealType scale = 1) + BOOST_MATH_GPU_ENABLED gamma_distribution(RealType shape, RealType scale = 1) - RealType shape()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; }} // namespaces @@ -76,7 +76,7 @@ a dedicated Erlang Distribution. 
[h4 Member Functions] - gamma_distribution(RealType shape, RealType scale = 1); + BOOST_MATH_GPU_ENABLED gamma_distribution(RealType shape, RealType scale = 1); Constructs a gamma distribution with shape /shape/ and scale /scale/. @@ -84,11 +84,11 @@ scale /scale/. Requires that the shape and scale parameters are greater than zero, otherwise calls __domain_error. - RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; Returns the /shape/ parameter of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the /scale/ parameter of this distribution. @@ -96,6 +96,8 @@ Returns the /scale/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0,+[infin]\]. diff --git a/doc/distributions/geometric.qbk b/doc/distributions/geometric.qbk index 7aa1a3343..038753d95 100644 --- a/doc/distributions/geometric.qbk +++ b/doc/distributions/geometric.qbk @@ -17,28 +17,28 @@ typedef RealType value_type; typedef Policy policy_type; // Constructor from success_fraction: - geometric_distribution(RealType p); + BOOST_MATH_GPU_ENABLED geometric_distribution(RealType p); // Parameter accessors: - RealType success_fraction() const; - RealType successes() const; + BOOST_MATH_GPU_ENABLED RealType success_fraction() const; + BOOST_MATH_GPU_ENABLED RealType successes() const; // Bounds on success fraction: - static RealType find_lower_bound_on_p( + BOOST_MATH_GPU_ENABLED static RealType find_lower_bound_on_p( RealType trials, RealType successes, RealType probability); // alpha - static RealType find_upper_bound_on_p( + BOOST_MATH_GPU_ENABLED static RealType find_upper_bound_on_p( RealType trials, RealType successes, RealType probability); // alpha // Estimate min/max number of trials: - static RealType find_minimum_number_of_trials( + BOOST_MATH_GPU_ENABLED static RealType find_minimum_number_of_trials( RealType k, // Number of failures. RealType p, // Success fraction. RealType probability); // Probability threshold alpha. - static RealType find_maximum_number_of_trials( + BOOST_MATH_GPU_ENABLED static RealType find_maximum_number_of_trials( RealType k, // Number of failures. RealType p, // Success fraction. RealType probability); // Probability threshold alpha. @@ -268,6 +268,8 @@ of observing more than k failures. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. However it's worth taking a moment to define what these actually mean in the context of this distribution: diff --git a/doc/distributions/inverse_chi_squared.qbk b/doc/distributions/inverse_chi_squared.qbk index 7bc75a881..8d67082d0 100644 --- a/doc/distributions/inverse_chi_squared.qbk +++ b/doc/distributions/inverse_chi_squared.qbk @@ -12,11 +12,11 @@ typedef RealType value_type; typedef Policy policy_type; - inverse_chi_squared_distribution(RealType df = 1); // Not explicitly scaled, default 1/df. - inverse_chi_squared_distribution(RealType df, RealType scale = 1/df); // Scaled. 
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1); // Not explicitly scaled, default 1/df.
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType scale = 1/df); // Scaled.
 
-   RealType degrees_of_freedom()const; // Default 1.
-   RealType scale()const; // Optional scale [xi] (variance), default 1/degrees_of_freedom.
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const; // Default 1.
+   BOOST_MATH_GPU_ENABLED RealType scale()const; // Optional scale [xi] (variance), default 1/degrees_of_freedom.
 };
 
 }} // namespace boost // namespace math
 
@@ -99,8 +99,8 @@ varies for a few values of parameters [nu] and [xi]:
 
 [h4 Member Functions]
 
-   inverse_chi_squared_distribution(RealType df = 1); // Implicitly scaled 1/df.
-   inverse_chi_squared_distribution(RealType df = 1, RealType scale); // Explicitly scaled.
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1); // Implicitly scaled 1/df.
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType scale = 1/df); // Explicitly scaled.
 
 Constructs an inverse chi_squared distribution with [nu] degrees of freedom ['df],
 and scale ['scale] with default value 1\/df.
 
 Requires that the degrees of freedom [nu] parameter is greater than zero, otherwise calls __domain_error.
 
-   RealType degrees_of_freedom()const;
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const;
 
 Returns the degrees_of_freedom [nu] parameter of this distribution.
 
-   RealType scale()const;
+   BOOST_MATH_GPU_ENABLED RealType scale()const;
 
 Returns the scale [xi] parameter of this distribution.
 
@@ -120,6 +120,8 @@ Returns the scale [xi] parameter of this distribution.
 
 All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions]
 that are generic to all distributions are supported: __usual_accessors.
+For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can
+be run on both host and device.
 
 The domain of the random variate is \[0,+[infin]\].
 
 [note Unlike some definitions, this implementation supports a random variate
diff --git a/doc/distributions/inverse_gamma.qbk b/doc/distributions/inverse_gamma.qbk
index 8fccbc19c..ee68651df 100644
--- a/doc/distributions/inverse_gamma.qbk
+++ b/doc/distributions/inverse_gamma.qbk
@@ -12,10 +12,10 @@
 
     typedef RealType value_type;
     typedef Policy policy_type;
 
-   inverse_gamma_distribution(RealType shape, RealType scale = 1)
+   BOOST_MATH_GPU_ENABLED inverse_gamma_distribution(RealType shape, RealType scale = 1)
 
-   RealType shape()const;
-   RealType scale()const;
+   BOOST_MATH_GPU_ENABLED RealType shape()const;
+   BOOST_MATH_GPU_ENABLED RealType scale()const;
 };
 
 }} // namespaces
 
@@ -63,18 +63,18 @@ varies as the parameters vary:
 
 [h4 Member Functions]
 
-   inverse_gamma_distribution(RealType shape = 1, RealType scale = 1);
+   BOOST_MATH_GPU_ENABLED inverse_gamma_distribution(RealType shape = 1, RealType scale = 1);
 
 Constructs an inverse gamma distribution with shape [alpha] and scale [beta].
 
 Requires that the shape and scale parameters are greater than zero, otherwise calls
 __domain_error.
 
-   RealType shape()const;
+   BOOST_MATH_GPU_ENABLED RealType shape()const;
 
 Returns the [alpha] shape parameter of this inverse gamma distribution.
 
-   RealType scale()const;
+   BOOST_MATH_GPU_ENABLED RealType scale()const;
 
 Returns the [beta] scale parameter of this inverse gamma distribution.
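+
+As a minimal device-side sketch (the kernel name and launch configuration here
+are illustrative only, not part of the documented API), the constructor and
+both accessors can be called directly from a CUDA kernel:
+
+    __global__ void inverse_gamma_example(const double* in, double* out, int n)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < n)
+        {
+            boost::math::inverse_gamma_distribution<double> dist(2.0, 3.0); // shape() == 2, scale() == 3
+            out[i] = cdf(dist, in[i]);
+        }
+    }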
@@ -82,6 +82,8 @@ Returns the [beta] scale parameter of this inverse gamma distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variate is \[0,+[infin]\]. [note Unlike some definitions, this implementation supports a random variate