From b1d4f142dddbc12222fa965225e95ee787708ac5 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 15:46:17 -0400
Subject: [PATCH 01/31] Add overview

---
 doc/math.qbk         |  1 +
 doc/overview/gpu.qbk | 59 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 doc/overview/gpu.qbk

diff --git a/doc/math.qbk b/doc/math.qbk
index 30bc53339..1ea17752d 100644
--- a/doc/math.qbk
+++ b/doc/math.qbk
@@ -557,6 +557,7 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22.
 [include overview/standalone.qbk]
 [include overview/result_type_calc.qbk]
 [include overview/error_handling.qbk]
+[include overview/gpu.qbk]
 
 [section:compilers_overview Compilers]
 [compilers_overview]
diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
new file mode 100644
index 000000000..70f0164e0
--- /dev/null
+++ b/doc/overview/gpu.qbk
@@ -0,0 +1,59 @@
+[section:gpu Support for GPU programming in Boost.Math]
+
+[h4 GPU Support]
+
+Selected functions, distributions, tools, etc. support running on both host and device.
+These functions will have the annotation `BOOST_MATH_GPU_ENABLED` next to their individual documentation.
+We test using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
+
+[h4 How to build with device support]
+
+When compiling with CUDA or SYCL, you will have to ensure that your code runs inside a kernel function.
+It is not enough to simply compile existing code with the NVCC compiler to run it on the device.
+A simple CUDA kernel to run the Beta Distribution CDF on NVCC would be:
+
+    __global__ void cuda_beta_dist(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+        if (i < num_elements)
+        {
+            out[i] = cdf(boost::math::beta_distribution<double>(), in[i]);
+        }
+    }
+
+The same kernel on NVRTC:
+
+    const char* cuda_kernel = R"(
+    #include <boost/math/distributions/beta.hpp>
+    extern "C" __global__
+    void test_beta_dist_kernel(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            out[i] = boost::math::cdf(boost::math::beta_distribution<double>(), in[i]);
+        }
+    }
+    )";
+
+And lastly on SYCL:
+
+    void sycl_beta_dist(const double* in, double* out, int num_elements, sycl::queue& q)
+    {
+        q.submit([&](sycl::handler& h) {
+            h.parallel_for(sycl::range<1>(num_elements), [=](sycl::id<1> i) {
+                out[i] = boost::math::cdf(boost::math::beta_distribution<double>(), in[i]);
+            });
+        });
+    }
+
+Once your kernel function has been written, use the framework's usual mechanism for launching the kernel.
+
+[/
+  Copyright 2024. Matt Borland
+  Distributed under the Boost Software License, Version 1.0.
+  (See accompanying file LICENSE_1_0.txt or copy at
+  http://www.boost.org/LICENSE_1_0.txt).
+]
+

From cb99a775e736a06ffe538fdd63f39223309a6486 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 15:46:33 -0400
Subject: [PATCH 02/31] Annotate GPU markers in constants doc with type caveat

---
 doc/constants/constants.qbk | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/constants/constants.qbk b/doc/constants/constants.qbk
index 24092adf5..9cce152da 100644
--- a/doc/constants/constants.qbk
+++ b/doc/constants/constants.qbk
@@ -227,6 +227,11 @@ either construct from a decimal digit string or calculate on the fly depending u
 [[Any other value ['N]][Sets the compile time precision to ['N] bits.]]
 ]
 
+[h5 GPU Support]
+
+All Boost.Math constants are marked with `BOOST_MATH_GPU_ENABLED` and can be used on both host and device.
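+For example, a minimal CUDA kernel scaling an array by pi might look like the following sketch
+(the kernel name and launch configuration are illustrative):
+
+    __global__ void scale_by_pi(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            out[i] = in[i] * boost::math::constants::pi<double>();
+        }
+    }
+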
+Note that when running on device you are limited to using only `float` and `double` types. + [h5 Custom Specializing a constant] In addition, for user-defined types that need special handling, it's possible to partially-specialize From 603ffd2adda30e1fefb7761c3236fdebe32be1e3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 30 Aug 2024 15:46:47 -0400 Subject: [PATCH 03/31] Add GPU markers to supported dists docs --- doc/distributions/arcsine.qbk | 12 +++++++----- doc/distributions/bernoulli.qbk | 10 ++++++---- doc/distributions/beta.qbk | 22 ++++++++++++---------- doc/distributions/cauchy.qbk | 14 ++++++++------ doc/distributions/chi_squared.qbk | 8 +++++--- doc/distributions/exponential.qbk | 10 ++++++---- doc/distributions/extreme_value.qbk | 14 ++++++++------ doc/distributions/holtsmark.qbk | 14 ++++++++------ doc/distributions/landau.qbk | 18 ++++++++++-------- doc/distributions/laplace.qbk | 14 ++++++++------ doc/distributions/logistic.qbk | 14 ++++++++------ doc/distributions/mapairy.qbk | 14 ++++++++------ doc/distributions/saspoint5.qbk | 14 ++++++++------ doc/distributions/weibull.qbk | 14 ++++++++------ 14 files changed, 110 insertions(+), 82 deletions(-) diff --git a/doc/distributions/arcsine.qbk b/doc/distributions/arcsine.qbk index fbd6e86b1..7930f97d5 100644 --- a/doc/distributions/arcsine.qbk +++ b/doc/distributions/arcsine.qbk @@ -21,11 +21,11 @@ typedef Policy policy_type; // Constructor from two range parameters, x_min and x_max: - arcsine_distribution(RealType x_min = 0, RealType x_max = 1); + BOOST_MATH_GPU_ENABLED arcsine_distribution(RealType x_min = 0, RealType x_max = 1); // Range Parameter accessors: - RealType x_min() const; - RealType x_max() const; + BOOST_MATH_GPU_ENABLED RealType x_min() const; + BOOST_MATH_GPU_ENABLED RealType x_max() const; }; }} // namespaces @@ -103,8 +103,8 @@ constructs a 'Standard 01' arcsine distribution. [h5 Parameter Accessors] - RealType x_min() const; - RealType x_max() const; + BOOST_MATH_GPU_ENABLED RealType x_min() const; + BOOST_MATH_GPU_ENABLED RealType x_max() const; Return the parameter ['x_min] or ['x_max] from which this distribution was constructed. @@ -116,6 +116,8 @@ So, for example: All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The formulae for calculating these are shown in the table below, and at [@http://mathworld.wolfram.com/arcsineDistribution.html Wolfram Mathworld]. diff --git a/doc/distributions/bernoulli.qbk b/doc/distributions/bernoulli.qbk index 4a2fc7b61..719c42cd9 100644 --- a/doc/distributions/bernoulli.qbk +++ b/doc/distributions/bernoulli.qbk @@ -16,9 +16,9 @@ typedef RealType value_type; typedef Policy policy_type; - bernoulli_distribution(RealType p); // Constructor. + BOOST_MATH_GPU_ENABLED bernoulli_distribution(RealType p); // Constructor. // Accessor function. - RealType success_fraction() const + BOOST_MATH_GPU_ENABLED RealType success_fraction() const // Probability of success (as a fraction). }; }} // namespaces @@ -51,12 +51,12 @@ and the [@http://en.wikipedia.org/wiki/Cumulative_Distribution_Function Cumulati [h4 Member Functions] - bernoulli_distribution(RealType p); + BOOST_MATH_GPU_ENABLED bernoulli_distribution(RealType p); Constructs a [@http://en.wikipedia.org/wiki/bernoulli_distribution bernoulli distribution] with success_fraction /p/. 
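+For example, a fair coin can be modelled as follows (a minimal sketch; the variable names are illustrative):
+
+    boost::math::bernoulli_distribution<double> coin(0.5);
+    double p_heads = boost::math::pdf(coin, 1); // 0.5
+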
- RealType success_fraction() const + BOOST_MATH_GPU_ENABLED RealType success_fraction() const Returns the /success_fraction/ parameter of this distribution. @@ -64,6 +64,8 @@ Returns the /success_fraction/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is 0 and 1, and the useful supported range is only 0 or 1. diff --git a/doc/distributions/beta.qbk b/doc/distributions/beta.qbk index 95943f715..5ba1a6d1c 100644 --- a/doc/distributions/beta.qbk +++ b/doc/distributions/beta.qbk @@ -19,30 +19,30 @@ typedef RealType value_type; typedef Policy policy_type; // Constructor from two shape parameters, alpha & beta: - beta_distribution(RealType a, RealType b); + BOOST_MATH_GPU_ENABLED beta_distribution(RealType a, RealType b); // Parameter accessors: - RealType alpha() const; - RealType beta() const; + BOOST_MATH_GPU_ENABLED RealType alpha() const; + BOOST_MATH_GPU_ENABLED RealType beta() const; // Parameter estimators of alpha or beta from mean and variance. - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType mean, // Expected value of mean. RealType variance); // Expected value of variance. - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( RealType mean, // Expected value of mean. RealType variance); // Expected value of variance. // Parameter estimators from // either alpha or beta, and x and probability. - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType beta, // from beta. RealType x, // x. RealType probability); // cdf - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( RealType alpha, // alpha. RealType x, // probability x. RealType probability); // probability cdf. @@ -98,7 +98,7 @@ whose apex is away from the centre (where x = half). [h5 Constructor] - beta_distribution(RealType alpha, RealType beta); + BOOST_MATH_GPU_ENABLED beta_distribution(RealType alpha, RealType beta); Constructs a beta distribution with shape parameters /alpha/ and /beta/. @@ -117,11 +117,11 @@ in the graph above). [h5 Parameter Accessors] - RealType alpha() const; + BOOST_MATH_GPU_ENABLED RealType alpha() const; Returns the parameter /alpha/ from which this distribution was constructed. - RealType beta() const; + BOOST_MATH_GPU_ENABLED RealType beta() const; Returns the parameter /beta/ from which this distribution was constructed. @@ -182,6 +182,8 @@ Returns the value of [beta] that gives: All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The formulae for calculating these are shown in the table below, and at [@http://mathworld.wolfram.com/BetaDistribution.html Wolfram Mathworld]. 
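+For example, the estimators can recover the shape parameters from sample moments
+(a minimal sketch; the numeric values are illustrative):
+
+    double mean = 0.2, variance = 0.01;
+    double a = boost::math::beta_distribution<double>::find_alpha(mean, variance);
+    double b = boost::math::beta_distribution<double>::find_beta(mean, variance);
+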
diff --git a/doc/distributions/cauchy.qbk b/doc/distributions/cauchy.qbk index 6ae090818..4a177d294 100644 --- a/doc/distributions/cauchy.qbk +++ b/doc/distributions/cauchy.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - cauchy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED cauchy_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; The [@http://en.wikipedia.org/wiki/Cauchy_distribution Cauchy-Lorentz distribution] @@ -53,7 +53,7 @@ the distribution: [h4 Member Functions] - cauchy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED cauchy_distribution(RealType location = 0, RealType scale = 1); Constructs a Cauchy distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -62,11 +62,11 @@ then the result is a Standard Cauchy Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -74,6 +74,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the Cauchy distribution does not have a mean, standard deviation, etc. See __math_undefined diff --git a/doc/distributions/chi_squared.qbk b/doc/distributions/chi_squared.qbk index 753e1f401..b52d4d392 100644 --- a/doc/distributions/chi_squared.qbk +++ b/doc/distributions/chi_squared.qbk @@ -18,13 +18,13 @@ typedef Policy policy_type; // Constructor: - chi_squared_distribution(RealType i); + BOOST_MATH_GPU_ENABLED chi_squared_distribution(RealType i); // Accessor to parameter: - RealType degrees_of_freedom()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const; // Parameter estimation: - static RealType find_degrees_of_freedom( + BOOST_MATH_GPU_ENABLED static RealType find_degrees_of_freedom( RealType difference_from_mean, RealType alpha, RealType beta, @@ -104,6 +104,8 @@ See also section on Sample sizes required in All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. 
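+For example, `find_degrees_of_freedom` can be called as follows (a minimal sketch; the numeric values are illustrative):
+
+    double df = boost::math::chi_squared_distribution<double>::find_degrees_of_freedom(
+        2,     // difference_from_mean to be detected
+        0.05,  // alpha, the maximum acceptable risk of a false positive
+        0.05,  // beta, the maximum acceptable risk of a false negative
+        4);    // variance
+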
(We have followed the usual restriction of the mode to degrees of freedom >= 2, but note that the maximum of the pdf is actually zero for degrees of freedom from 2 down to 0, diff --git a/doc/distributions/exponential.qbk b/doc/distributions/exponential.qbk index 043818b4a..5214df7d4 100644 --- a/doc/distributions/exponential.qbk +++ b/doc/distributions/exponential.qbk @@ -15,9 +15,9 @@ typedef RealType value_type; typedef Policy policy_type; - exponential_distribution(RealType lambda = 1); + BOOST_MATH_GPU_ENABLED exponential_distribution(RealType lambda = 1); - RealType lambda()const; + BOOST_MATH_GPU_ENABLED RealType lambda()const; }; @@ -37,7 +37,7 @@ values of the rate parameter lambda: [h4 Member Functions] - exponential_distribution(RealType lambda = 1); + BOOST_MATH_GPU_ENABLED exponential_distribution(RealType lambda = 1); Constructs an [@http://en.wikipedia.org/wiki/Exponential_distribution Exponential distribution] @@ -46,7 +46,7 @@ Lambda is defined as the reciprocal of the scale parameter. Requires lambda > 0, otherwise calls __domain_error. - RealType lambda()const; + BOOST_MATH_GPU_ENABLED RealType lambda()const; Accessor function returns the lambda parameter of the distribution. @@ -54,6 +54,8 @@ Accessor function returns the lambda parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0, +[infin]\]. diff --git a/doc/distributions/extreme_value.qbk b/doc/distributions/extreme_value.qbk index 314917ebc..bc4e27039 100644 --- a/doc/distributions/extreme_value.qbk +++ b/doc/distributions/extreme_value.qbk @@ -14,10 +14,10 @@ public: typedef RealType value_type; - extreme_value_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED extreme_value_distribution(RealType location = 0, RealType scale = 1); - RealType scale()const; - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; }; There are various @@ -59,18 +59,18 @@ And this graph illustrates how the PDF varies with the shape parameter: [h4 Member Functions] - extreme_value_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED extreme_value_distribution(RealType location = 0, RealType scale = 1); Constructs an Extreme Value distribution with the specified location and scale parameters. Requires `scale > 0`, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -78,6 +78,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random parameter is \[-[infin], +[infin]\]. 
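+For example, quantiles of the distribution can be computed in a CUDA kernel (a minimal sketch; the kernel name is illustrative):
+
+    __global__ void ev_quantile(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            // in[i] is expected to be a probability in (0, 1)
+            out[i] = quantile(boost::math::extreme_value_distribution<double>(), in[i]);
+        }
+    }
+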
diff --git a/doc/distributions/holtsmark.qbk b/doc/distributions/holtsmark.qbk index 49149ab92..39c42ff13 100644 --- a/doc/distributions/holtsmark.qbk +++ b/doc/distributions/holtsmark.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - holtsmark_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED holtsmark_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; The [@http://en.wikipedia.org/wiki/holtsmark_distribution Holtsmark distribution] @@ -51,7 +51,7 @@ the distribution: [h4 Member Functions] - holtsmark_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED holtsmark_distribution(RealType location = 0, RealType scale = 1); Constructs a holtsmark distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -60,11 +60,11 @@ then the result is a Standard holtsmark Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -72,6 +72,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the holtsmark distribution does not have a skewness, kurtosis, etc. See __math_undefined diff --git a/doc/distributions/landau.qbk b/doc/distributions/landau.qbk index b73450504..90dced0aa 100644 --- a/doc/distributions/landau.qbk +++ b/doc/distributions/landau.qbk @@ -15,11 +15,11 @@ typedef RealType value_type; typedef Policy policy_type; - landau_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED landau_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; - RealType bias()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType bias()const; }; The [@http://en.wikipedia.org/wiki/landau_distribution Landau distribution] @@ -54,7 +54,7 @@ the distribution: [h4 Member Functions] - landau_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED landau_distribution(RealType location = 0, RealType scale = 1); Constructs a landau distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -63,15 +63,15 @@ then the result is a Standard landau Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. - RealType bias()const; + BOOST_MATH_GPU_ENABLED RealType bias()const; Returns the amount of translation by the scale parameter. [expression bias = - 2 / [pi] log(c)] @@ -80,6 +80,8 @@ Returns the amount of translation by the scale parameter. 
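+For example (a minimal sketch):
+
+    boost::math::landau_distribution<double> d; // location 0, scale 1
+    double b = d.bias();                        // -2/pi * log(1) == 0 for unit scale
+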
All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the landau distribution does not have a mean, standard deviation, etc. See __math_undefined diff --git a/doc/distributions/laplace.qbk b/doc/distributions/laplace.qbk index 93327e022..861c513f4 100644 --- a/doc/distributions/laplace.qbk +++ b/doc/distributions/laplace.qbk @@ -17,10 +17,10 @@ typedef RealType value_type; typedef Policy policy_type; // Construct: - laplace_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED laplace_distribution(RealType location = 0, RealType scale = 1); // Accessors: - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; }} // namespaces @@ -49,7 +49,7 @@ Note that the domain of the random variable remains [h4 Member Functions] - laplace_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED laplace_distribution(RealType location = 0, RealType scale = 1); Constructs a laplace distribution with location /location/ and scale /scale/. @@ -61,11 +61,11 @@ The scale parameter is proportional to the standard deviation of the random vari Requires that the scale parameter is greater than zero, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the /location/ parameter of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the /scale/ parameter of this distribution. @@ -73,6 +73,8 @@ Returns the /scale/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[-[infin],+[infin]\]. diff --git a/doc/distributions/logistic.qbk b/doc/distributions/logistic.qbk index 0a22b48d4..68557eb01 100644 --- a/doc/distributions/logistic.qbk +++ b/doc/distributions/logistic.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; // Construct: - logistic_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED logistic_distribution(RealType location = 0, RealType scale = 1); // Accessors: - RealType location()const; // location. - RealType scale()const; // scale. + BOOST_MATH_GPU_ENABLED RealType location()const; // location. + BOOST_MATH_GPU_ENABLED RealType scale()const; // scale. }; @@ -39,17 +39,17 @@ parameters change: [h4 Member Functions] - logistic_distribution(RealType u = 0, RealType s = 1); + BOOST_MATH_GPU_ENABLED logistic_distribution(RealType u = 0, RealType s = 1); Constructs a logistic distribution with location /u/ and scale /s/. Requires `scale > 0`, otherwise a __domain_error is raised. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale of this distribution. @@ -57,6 +57,8 @@ Returns the scale of this distribution. 
All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[-\[max_value\], +\[min_value\]\]. However, the pdf and cdf support inputs of +[infin] and -[infin] diff --git a/doc/distributions/mapairy.qbk b/doc/distributions/mapairy.qbk index 97d624a93..817fb980d 100644 --- a/doc/distributions/mapairy.qbk +++ b/doc/distributions/mapairy.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - mapairy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED mapairy_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; It is special case of a [@http://en.wikipedia.org/wiki/Stable_distribution stable distribution] @@ -50,7 +50,7 @@ the distribution: [h4 Member Functions] - mapairy_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED mapairy_distribution(RealType location = 0, RealType scale = 1); Constructs a mapairy distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -59,11 +59,11 @@ then the result is a Standard map-airy Distribution. Requires scale > 0, otherwise calls __domain_error. - RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -71,6 +71,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the map-airy distribution does not have a skewness, kurtosis, etc. See __math_undefined diff --git a/doc/distributions/saspoint5.qbk b/doc/distributions/saspoint5.qbk index 1421b5bac..06efbd32e 100644 --- a/doc/distributions/saspoint5.qbk +++ b/doc/distributions/saspoint5.qbk @@ -15,10 +15,10 @@ typedef RealType value_type; typedef Policy policy_type; - saspoint5_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED saspoint5_distribution(RealType location = 0, RealType scale = 1); - RealType location()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType location()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; It is special case of a [@http://en.wikipedia.org/wiki/Stable_distribution stable distribution] @@ -49,7 +49,7 @@ the distribution: [h4 Member Functions] - saspoint5_distribution(RealType location = 0, RealType scale = 1); + BOOST_MATH_GPU_ENABLED saspoint5_distribution(RealType location = 0, RealType scale = 1); Constructs a S[alpha]S Point5 distribution, with location parameter /location/ and scale parameter /scale/. When these parameters take their default @@ -58,11 +58,11 @@ then the result is a Standard S[alpha]S Point5 Distribution. Requires scale > 0, otherwise calls __domain_error. 
- RealType location()const; + BOOST_MATH_GPU_ENABLED RealType location()const; Returns the location parameter of the distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the scale parameter of the distribution. @@ -70,6 +70,8 @@ Returns the scale parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. Note however that the S[alpha]S Point5 distribution does not have a mean, standard deviation, etc. See __math_undefined diff --git a/doc/distributions/weibull.qbk b/doc/distributions/weibull.qbk index 95c9e461e..5d7c11b5f 100644 --- a/doc/distributions/weibull.qbk +++ b/doc/distributions/weibull.qbk @@ -17,10 +17,10 @@ typedef RealType value_type; typedef Policy policy_type; // Construct: - weibull_distribution(RealType shape, RealType scale = 1) + BOOST_MATH_GPU_ENABLED weibull_distribution(RealType shape, RealType scale = 1) // Accessors: - RealType shape()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; }} // namespaces @@ -65,7 +65,7 @@ Samuel Kotz & Saralees Nadarajah]. [h4 Member Functions] - weibull_distribution(RealType shape, RealType scale = 1); + BOOST_MATH_GPU_ENABLED weibull_distribution(RealType shape, RealType scale = 1); Constructs a [@http://en.wikipedia.org/wiki/Weibull_distribution Weibull distribution] with shape /shape/ and scale /scale/. @@ -73,11 +73,11 @@ Weibull distribution] with shape /shape/ and scale /scale/. Requires that the /shape/ and /scale/ parameters are both greater than zero, otherwise calls __domain_error. - RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; Returns the /shape/ parameter of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the /scale/ parameter of this distribution. @@ -85,6 +85,8 @@ Returns the /scale/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0, [infin]\]. 
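+For example, the PDF can be evaluated on device (a minimal sketch; the kernel name and the shape value are illustrative):
+
+    __global__ void weibull_pdf(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            // in[i] is expected to be >= 0
+            out[i] = pdf(boost::math::weibull_distribution<double>(1.5), in[i]);
+        }
+    }
+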
From 539c81b15ac4f604b36eac06502716bf9dadbb03 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 16:08:41 -0400
Subject: [PATCH 04/31] Add markers to special functions

---
 doc/overview/gpu.qbk         |  5 +++
 doc/sf/bessel_ik.qbk         |  8 ++---
 doc/sf/bessel_jy.qbk         |  8 ++---
 doc/sf/bessel_spherical.qbk  |  8 ++---
 doc/sf/beta.qbk              |  4 +--
 doc/sf/beta_derivative.qbk   |  4 +--
 doc/sf/digamma.qbk           |  4 +--
 doc/sf/erf.qbk               | 16 ++++-----
 doc/sf/erf_inv.qbk           | 16 ++++-----
 doc/sf/gamma_derivatives.qbk |  4 +--
 doc/sf/gamma_ratios.qbk      | 16 ++++-----
 doc/sf/ibeta.qbk             | 32 +++++++++---------
 doc/sf/ibeta_inv.qbk         | 64 ++++++++++++++++++------------------
 doc/sf/igamma.qbk            | 32 +++++++++---------
 doc/sf/igamma_inv.qbk        | 32 +++++++++---------
 doc/sf/lgamma.qbk            |  8 ++---
 doc/sf/pow.qbk               |  4 +--
 doc/sf/sinc.qbk              | 12 +++----
 doc/sf/tgamma.qbk            | 16 ++++-----
 doc/sf/trigamma.qbk          |  4 +--
 20 files changed, 151 insertions(+), 146 deletions(-)

diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
index 70f0164e0..18ebaba2a 100644
--- a/doc/overview/gpu.qbk
+++ b/doc/overview/gpu.qbk
@@ -6,6 +6,11 @@ Selected functions, distributions, tools, etc. support running on both host and
 These functions will have the annotation `BOOST_MATH_GPU_ENABLED` next to their individual documentation.
 We test using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
 
+[h4 Policies]
+
+The default error-handling policy on all devices is `ignore_error`, because exceptions cannot be thrown on device.
+A user can specify their own policy as usual, but when the code is run on device it will be ignored.
+
 [h4 How to build with device support]
diff --git a/doc/sf/bessel_ik.qbk b/doc/sf/bessel_ik.qbk
index d044ac7b8..9fa4e63a7 100644
--- a/doc/sf/bessel_ik.qbk
+++ b/doc/sf/bessel_ik.qbk
@@ -5,16 +5,16 @@
 `#include <boost/math/special_functions/bessel.hpp>`
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_bessel_i(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_i(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_bessel_i(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_i(T1 v, T2 x, const ``__Policy``&);
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_bessel_k(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_k(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_bessel_k(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_k(T1 v, T2 x, const ``__Policy``&);
 
 [h4 Description]
diff --git a/doc/sf/bessel_jy.qbk b/doc/sf/bessel_jy.qbk
index 1f43bc758..faf878850 100644
--- a/doc/sf/bessel_jy.qbk
+++ b/doc/sf/bessel_jy.qbk
@@ -5,16 +5,16 @@
 `#include <boost/math/special_functions/bessel.hpp>`
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_bessel_j(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_j(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_bessel_j(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_bessel_j(T1 v, T2 x, const ``__Policy``&);
 
    template <class T1, class T2>
-   ``__sf_result`` cyl_neumann(T1 v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_neumann(T1 v, T2 x);
 
    template <class T1, class T2, class ``__Policy``>
-   ``__sf_result`` cyl_neumann(T1 v, T2 x, const ``__Policy``&);
+   BOOST_MATH_GPU_ENABLED ``__sf_result`` cyl_neumann(T1 v, T2 x, const ``__Policy``&);
 
 [h4 Description]
diff --git a/doc/sf/bessel_spherical.qbk b/doc/sf/bessel_spherical.qbk
index e9cda89c7..eb1fa6915 100644
--- a/doc/sf/bessel_spherical.qbk
+++ b/doc/sf/bessel_spherical.qbk
@@ -5,16 +5,16 @@
 `#include <boost/math/special_functions/bessel.hpp>`
 
    template <class T2>
-   ``__sf_result`` sph_bessel(unsigned v, T2 x);
+   BOOST_MATH_GPU_ENABLED ``__sf_result``
sph_bessel(unsigned v, T2 x); template - ``__sf_result`` sph_bessel(unsigned v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sph_bessel(unsigned v, T2 x, const ``__Policy``&); template - ``__sf_result`` sph_neumann(unsigned v, T2 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sph_neumann(unsigned v, T2 x); template - ``__sf_result`` sph_neumann(unsigned v, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sph_neumann(unsigned v, T2 x, const ``__Policy``&); [h4 Description] diff --git a/doc/sf/beta.qbk b/doc/sf/beta.qbk index e332fa503..7e1904c25 100644 --- a/doc/sf/beta.qbk +++ b/doc/sf/beta.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` beta(T1 a, T2 b); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b); template - ``__sf_result`` beta(T1 a, T2 b, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/beta_derivative.qbk b/doc/sf/beta_derivative.qbk index 8606d6f2b..5d3b9a13e 100644 --- a/doc/sf/beta_derivative.qbk +++ b/doc/sf/beta_derivative.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x); template - ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_derivative(T1 a, T2 b, T3 x, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/digamma.qbk b/doc/sf/digamma.qbk index c88c5fe7b..78b68403d 100644 --- a/doc/sf/digamma.qbk +++ b/doc/sf/digamma.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` digamma(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` digamma(T z); template - ``__sf_result`` digamma(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` digamma(T z, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/erf.qbk b/doc/sf/erf.qbk index 3207b66c0..5f6bdf9fa 100644 --- a/doc/sf/erf.qbk +++ b/doc/sf/erf.qbk @@ -9,16 +9,16 @@ namespace boost{ namespace math{ template - ``__sf_result`` erf(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z); template - ``__sf_result`` erf(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z, const ``__Policy``&); template - ``__sf_result`` erfc(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z); template - ``__sf_result`` erfc(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z, const ``__Policy``&); }} // namespaces @@ -30,10 +30,10 @@ the return type is `double` if T is an integer type, and T otherwise. 
[h4 Description] template - ``__sf_result`` erf(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z); template - ``__sf_result`` erf(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf(T z, const ``__Policy``&); Returns the [@http://en.wikipedia.org/wiki/Error_function error function] [@http://functions.wolfram.com/GammaBetaErf/Erf/ erf] of z: @@ -43,10 +43,10 @@ Returns the [@http://en.wikipedia.org/wiki/Error_function error function] [graph erf] template - ``__sf_result`` erfc(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z); template - ``__sf_result`` erfc(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc(T z, const ``__Policy``&); Returns the complement of the [@http://functions.wolfram.com/GammaBetaErf/Erfc/ error function] of z: diff --git a/doc/sf/erf_inv.qbk b/doc/sf/erf_inv.qbk index 729ec22d2..e8f7464e0 100644 --- a/doc/sf/erf_inv.qbk +++ b/doc/sf/erf_inv.qbk @@ -9,16 +9,16 @@ namespace boost{ namespace math{ template - ``__sf_result`` erf_inv(T p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T p); template - ``__sf_result`` erf_inv(T p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T p, const ``__Policy``&); template - ``__sf_result`` erfc_inv(T p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T p); template - ``__sf_result`` erfc_inv(T p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T p, const ``__Policy``&); }} // namespaces @@ -30,10 +30,10 @@ the return type is `double` if T is an integer type, and T otherwise. [h4 Description] template - ``__sf_result`` erf_inv(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T z); template - ``__sf_result`` erf_inv(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erf_inv(T z, const ``__Policy``&); Returns the [@http://functions.wolfram.com/GammaBetaErf/InverseErf/ inverse error function] of z, that is a value x such that: @@ -43,10 +43,10 @@ of z, that is a value x such that: [graph erf_inv] template - ``__sf_result`` erfc_inv(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T z); template - ``__sf_result`` erfc_inv(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` erfc_inv(T z, const ``__Policy``&); Returns the inverse of the complement of the error function of z, that is a value x such that: diff --git a/doc/sf/gamma_derivatives.qbk b/doc/sf/gamma_derivatives.qbk index c7dd24879..1b578d8d9 100644 --- a/doc/sf/gamma_derivatives.qbk +++ b/doc/sf/gamma_derivatives.qbk @@ -9,10 +9,10 @@ namespace boost{ namespace math{ template - ``__sf_result`` gamma_p_derivative(T1 a, T2 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_derivative(T1 a, T2 x); template - ``__sf_result`` gamma_p_derivative(T1 a, T2 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_derivative(T1 a, T2 x, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/gamma_ratios.qbk b/doc/sf/gamma_ratios.qbk index a3fcf864c..0d076890d 100644 --- a/doc/sf/gamma_ratios.qbk +++ b/doc/sf/gamma_ratios.qbk @@ -7,26 +7,26 @@ namespace boost{ namespace math{ template - ``__sf_result`` tgamma_ratio(T1 a, T2 b); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b); template - ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); template - ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); template - 
``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); }} // namespaces [h4 Description] template - ``__sf_result`` tgamma_ratio(T1 a, T2 b); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b); template - ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_ratio(T1 a, T2 b, const ``__Policy``&); Returns the ratio of gamma functions: @@ -37,10 +37,10 @@ Returns the ratio of gamma functions: Internally this just calls `tgamma_delta_ratio(a, b-a)`. template - ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta); template - ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_delta_ratio(T1 a, T2 delta, const ``__Policy``&); Returns the ratio of gamma functions: diff --git a/doc/sf/ibeta.qbk b/doc/sf/ibeta.qbk index b4a20f928..5227b2d34 100644 --- a/doc/sf/ibeta.qbk +++ b/doc/sf/ibeta.qbk @@ -9,28 +9,28 @@ namespace boost{ namespace math{ template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x); template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x); template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); template - ``__sf_result`` beta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x); template - ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); template - ``__sf_result`` betac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x); template - ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); }} // namespaces @@ -57,10 +57,10 @@ when T1, T2 and T3 are different types. 
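+For example, the normalised incomplete beta can be evaluated on device (a minimal sketch; the kernel name and shape parameters are illustrative):
+
+    __global__ void ibeta_kernel(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            // in[i] is expected to be in [0, 1]
+            out[i] = boost::math::ibeta(2.0, 3.0, in[i]);
+        }
+    }
+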
[optional_policy] template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x); template - ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the normalised incomplete beta function of a, b and x: @@ -69,30 +69,30 @@ Returns the normalised incomplete beta function of a, b and x: [graph ibeta] template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x); template - ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the normalised complement of the incomplete beta function of a, b and x: [equation ibeta4] template - ``__sf_result`` beta(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x); template - ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` beta(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the full (non-normalised) incomplete beta function of a, b and x: [equation ibeta1] template - ``__sf_result`` betac(T1 a, T2 b, T3 x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x); template - ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` betac(T1 a, T2 b, T3 x, const ``__Policy``&); Returns the full (non-normalised) complement of the incomplete beta function of a, b and x: diff --git a/doc/sf/ibeta_inv.qbk b/doc/sf/ibeta_inv.qbk index 83c2b0008..60049db46 100644 --- a/doc/sf/ibeta_inv.qbk +++ b/doc/sf/ibeta_inv.qbk @@ -7,52 +7,52 @@ namespace boost{ namespace math{ template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q); template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED 
``__sf_result`` ibetac_inva(T1 b, T2 x, T3 q, const ``__Policy``&); template - ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p); template - ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_invb(T1 a, T2 x, T3 p, const ``__Policy``&); template - ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q); template - ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_invb(T1 a, T2 x, T3 q, const ``__Policy``&); }} // namespaces @@ -81,16 +81,16 @@ The return type of these functions is computed using the __arg_promotion_rules when called with arguments T1...TN of different types. template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, const ``__Policy``&); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py); template - ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibeta_inv(T1 a, T2 b, T3 p, T4* py, const ``__Policy``&); Returns a value /x/ such that: `p = ibeta(a, b, x);` and sets `*py = 1 - x` when the `py` parameter is provided and is non-null. @@ -104,16 +104,16 @@ Requires: /a,b > 0/ and /0 <= p <= 1/. [optional_policy] template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, const ``__Policy``&); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py); template - ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_inv(T1 a, T2 b, T3 q, T4* py, const ``__Policy``&); Returns a value /x/ such that: `q = ibetac(a, b, x);` and sets `*py = 1 - x` when the `py` parameter is provided and is non-null. @@ -127,10 +127,10 @@ Requires: /a,b > 0/ and /0 <= q <= 1/. [optional_policy] template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p); template - ``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_inva(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /a/ such that: `p = ibeta(a, b, x);` @@ -139,10 +139,10 @@ Requires: /b > 0/, /0 < x < 1/ and /0 <= p <= 1/. [optional_policy] template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p); template - ``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_inva(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /a/ such that: `q = ibetac(a, b, x);` @@ -151,10 +151,10 @@ Requires: /b > 0/, /0 < x < 1/ and /0 <= q <= 1/. 
[optional_policy] template - ``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p); template - ``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibeta_invb(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /b/ such that: `p = ibeta(a, b, x);` @@ -163,10 +163,10 @@ Requires: /a > 0/, /0 < x < 1/ and /0 <= p <= 1/. [optional_policy] template - ``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p); + BOOST_MATH_GPU_ENABLED``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p); template - ``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` ibetac_invb(T1 b, T2 x, T3 p, const ``__Policy``&); Returns a value /b/ such that: `q = ibetac(a, b, x);` diff --git a/doc/sf/igamma.qbk b/doc/sf/igamma.qbk index ca354ad10..4675928e6 100644 --- a/doc/sf/igamma.qbk +++ b/doc/sf/igamma.qbk @@ -9,28 +9,28 @@ namespace boost{ namespace math{ template - ``__sf_result`` gamma_p(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z); template - ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); template - ``__sf_result`` gamma_q(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z); template - ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); template - ``__sf_result`` tgamma_lower(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z); template - ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); template - ``__sf_result`` tgamma(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z); template - ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); }} // namespaces @@ -53,10 +53,10 @@ The return type of these functions is computed using the __arg_promotion_rules when T1 and T2 are different types, otherwise the return type is simply T1. 
template - ``__sf_result`` gamma_p(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z); template - ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p(T1 a, T2 z, const ``__Policy``&); Returns the normalised lower incomplete gamma function of a and z: @@ -67,10 +67,10 @@ This function changes rapidly from 0 to 1 around the point z == a: [graph gamma_p] template - ``__sf_result`` gamma_q(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z); template - ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q(T1 a, T2 z, const ``__Policy``&); Returns the normalised upper incomplete gamma function of a and z: @@ -81,20 +81,20 @@ This function changes rapidly from 1 to 0 around the point z == a: [graph gamma_q] template - ``__sf_result`` tgamma_lower(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z); template - ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma_lower(T1 a, T2 z, const ``__Policy``&); Returns the full (non-normalised) lower incomplete gamma function of a and z: [equation igamma2] template - ``__sf_result`` tgamma(T1 a, T2 z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z); template - ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T1 a, T2 z, const ``__Policy``&); Returns the full (non-normalised) upper incomplete gamma function of a and z: diff --git a/doc/sf/igamma_inv.qbk b/doc/sf/igamma_inv.qbk index 593c92141..55fe76e6e 100644 --- a/doc/sf/igamma_inv.qbk +++ b/doc/sf/igamma_inv.qbk @@ -9,28 +9,28 @@ namespace boost{ namespace math{ template - ``__sf_result`` gamma_q_inv(T1 a, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q); template - ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); template - ``__sf_result`` gamma_p_inv(T1 a, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p); template - ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); template - ``__sf_result`` gamma_q_inva(T1 x, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q); template - ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); template - ``__sf_result`` gamma_p_inva(T1 x, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p); template - ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); }} // namespaces @@ -58,40 +58,40 @@ These are implemented here as `gamma_p_inva` and `gamma_q_inva`.] template - ``__sf_result`` gamma_q_inv(T1 a, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q); template - ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inv(T1 a, T2 q, const ``__Policy``&); Returns a value x such that: `q = gamma_q(a, x);` Requires: /a > 0/ and /1 >= p,q >= 0/. 
template - ``__sf_result`` gamma_p_inv(T1 a, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p); template - ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inv(T1 a, T2 p, const ``__Policy``&); Returns a value x such that: `p = gamma_p(a, x);` Requires: /a > 0/ and /1 >= p,q >= 0/. template - ``__sf_result`` gamma_q_inva(T1 x, T2 q); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q); template - ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_q_inva(T1 x, T2 q, const ``__Policy``&); Returns a value a such that: `q = gamma_q(a, x);` Requires: /x > 0/ and /1 >= p,q >= 0/. template - ``__sf_result`` gamma_p_inva(T1 x, T2 p); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p); template - ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` gamma_p_inva(T1 x, T2 p, const ``__Policy``&); Returns a value a such that: `p = gamma_p(a, x);` diff --git a/doc/sf/lgamma.qbk b/doc/sf/lgamma.qbk index 5ea1a4e09..544485c7c 100644 --- a/doc/sf/lgamma.qbk +++ b/doc/sf/lgamma.qbk @@ -9,16 +9,16 @@ namespace boost{ namespace math{ template - ``__sf_result`` lgamma(T z); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z); template - ``__sf_result`` lgamma(T z, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z, const ``__Policy``&); template - ``__sf_result`` lgamma(T z, int* sign); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z, int* sign); template - ``__sf_result`` lgamma(T z, int* sign, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` lgamma(T z, int* sign, const ``__Policy``&); }} // namespaces diff --git a/doc/sf/pow.qbk b/doc/sf/pow.qbk index db021978e..ecb762d71 100644 --- a/doc/sf/pow.qbk +++ b/doc/sf/pow.qbk @@ -10,10 +10,10 @@ power of a run-time base. namespace boost { namespace math { template - constexpr ``__sf_result`` pow(T base); + BOOST_MATH_GPU_ENABLED constexpr ``__sf_result`` pow(T base); template - constexpr ``__sf_result`` pow(T base, const Policy& policy); + BOOST_MATH_GPU_ENABLED constexpr ``__sf_result`` pow(T base, const Policy& policy); }} diff --git a/doc/sf/sinc.qbk b/doc/sf/sinc.qbk index b345c08cd..a6042a717 100644 --- a/doc/sf/sinc.qbk +++ b/doc/sf/sinc.qbk @@ -43,16 +43,16 @@ and [@http://mathworld.wolfram.com/Octonion.html octonions]. `` template - ``__sf_result`` sinc_pi(const T x); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x); template - ``__sf_result`` sinc_pi(const T x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x, const ``__Policy``&); template class U> - U sinc_pi(const U x); + BOOST_MATH_GPU_ENABLED U sinc_pi(const U x); template class U, class ``__Policy``> - U sinc_pi(const U x, const ``__Policy``&); + BOOST_MATH_GPU_ENABLED U sinc_pi(const U x, const ``__Policy``&); Computes [link math_toolkit.sinc.sinc_overview @@ -78,10 +78,10 @@ to ensure accuracy. 
diff --git a/doc/sf/sinc.qbk b/doc/sf/sinc.qbk
index b345c08cd..a6042a717 100644
--- a/doc/sf/sinc.qbk
+++ b/doc/sf/sinc.qbk
@@ -43,16 +43,16 @@ and [@http://mathworld.wolfram.com/Octonion.html octonions].
 ``
   template <class T>
-  ``__sf_result`` sinc_pi(const T x);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x);

   template <class T, class ``__Policy``>
-  ``__sf_result`` sinc_pi(const T x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinc_pi(const T x, const ``__Policy``&);

   template<typename T, template<typename> class U>
-  U<T> sinc_pi(const U<T> x);
+  BOOST_MATH_GPU_ENABLED U<T> sinc_pi(const U<T> x);

   template<typename T, template<typename> class U, class ``__Policy``>
-  U<T> sinc_pi(const U<T> x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED U<T> sinc_pi(const U<T> x, const ``__Policy``&);

 Computes [link math_toolkit.sinc.sinc_overview
@@ -78,10 +78,10 @@ to ensure accuracy.
 ``
   template <class T>
-  ``__sf_result`` sinhc_pi(const T x);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinhc_pi(const T x);

   template <class T, class ``__Policy``>
-  ``__sf_result`` sinhc_pi(const T x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` sinhc_pi(const T x, const ``__Policy``&);

   template<typename T, template<typename> class U>
   U<T> sinhc_pi(const U<T> x);

diff --git a/doc/sf/tgamma.qbk b/doc/sf/tgamma.qbk
index 7eb535ec3..23baad2cb 100644
--- a/doc/sf/tgamma.qbk
+++ b/doc/sf/tgamma.qbk
@@ -9,26 +9,26 @@
 namespace boost{ namespace math{

   template <class T>
-  ``__sf_result`` tgamma(T z);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma(T z, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z, const ``__Policy``&);

   template <class T>
-  ``__sf_result`` tgamma1pm1(T dz);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);

   }} // namespaces

 [h4 Description]

   template <class T>
-  ``__sf_result`` tgamma(T z);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma(T z, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma(T z, const ``__Policy``&);

 Returns the "true gamma" (hence the name tgamma) of value z:

@@ -42,10 +42,10 @@ The return type of this function is computed using the __arg_promotion_rules:
 the result is `double` when T is an integer type, and T otherwise.

   template <class T>
-  ``__sf_result`` tgamma1pm1(T dz);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz);

   template <class T, class ``__Policy``>
-  ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` tgamma1pm1(T dz, const ``__Policy``&);

 Returns `tgamma(dz + 1) - 1`. Internally the implementation does not make use
 of the addition and subtraction implied by the definition, leading to

diff --git a/doc/sf/trigamma.qbk b/doc/sf/trigamma.qbk
index 137a148d8..a358c8571 100644
--- a/doc/sf/trigamma.qbk
+++ b/doc/sf/trigamma.qbk
@@ -9,10 +9,10 @@
 namespace boost{ namespace math{

   template <class T>
-  ``__sf_result`` trigamma(T x);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` trigamma(T x);

   template <class T, class ``__Policy``>
-  ``__sf_result`` trigamma(T x, const ``__Policy``&);
+  BOOST_MATH_GPU_ENABLED ``__sf_result`` trigamma(T x, const ``__Policy``&);

   }} // namespaces

From 2729683b09b7243f33f21ede21f10bde5d4fd811 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 16:11:45 -0400
Subject: [PATCH 05/31] Add markers to Newton-Raphson

---
 doc/roots/roots.qbk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/roots/roots.qbk b/doc/roots/roots.qbk
index a22930069..ee3619f4f 100644
--- a/doc/roots/roots.qbk
+++ b/doc/roots/roots.qbk
@@ -10,10 +10,10 @@
 namespace tools { // Note namespace boost::math::tools.
   // Newton-Raphson
   template <class F, class T>
-  T newton_raphson_iterate(F f, T guess, T min, T max, int digits);
+  BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits);

   template <class F, class T>
-  T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter);
+  BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter);

   // Halley
   template <class F, class T>

From b13fcb07c13a62dc2461563808866c452a727c56 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 16:14:06 -0400
Subject: [PATCH 06/31] Replace broken umlaut-o with oe (Ersatzschreibung)

---
 doc/roots/roots.qbk | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/roots/roots.qbk b/doc/roots/roots.qbk
index ee3619f4f..ea347639b 100644
--- a/doc/roots/roots.qbk
+++ b/doc/roots/roots.qbk
@@ -1,4 +1,4 @@
-[section:roots_deriv Root Finding With Derivatives: Newton-Raphson, Halley & Schr'''ö'''der]
+[section:roots_deriv Root Finding With Derivatives: Newton-Raphson, Halley & Schroeder]

 [h4 Synopsis]

@@ -22,7 +22,7 @@
   template <class F, class T>
   T halley_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter);

-  // Schr'''ö'''der
+  // Schroeder
   template <class F, class T>
   T schroder_iterate(F f, T guess, T min, T max, int digits);

@@ -61,7 +61,7 @@ For second-order iterative method ([@http://en.wikipedia.org/wiki/Newton_Raphson
 For the third-order methods ([@http://en.wikipedia.org/wiki/Halley%27s_method Halley] and
-Schr'''ö'''der)
+Schroeder)
 the `tuple` should have [*three] elements containing the evaluation of the
 function and its first and second derivatives.]]
 [[T guess] [The initial starting value. A good guess is crucial to quick convergence!]]

@@ -147,7 +147,7 @@
 Out of bounds steps revert to bisection of the current bounds.

 Under ideal conditions, the number of correct digits trebles with each iteration.

-[h4:schroder Schr'''ö'''der's Method]
+[h4:schroder Schroeder's Method]

 Given an initial guess x0 the subsequent values are computed using:

@@ -162,8 +162,8 @@
 Out of bounds steps revert to __bisection_wikipedia of the current bounds.

 Under ideal conditions, the number of correct digits trebles with each iteration.

-This is Schr'''ö'''der's general result (equation 18 from [@http://drum.lib.umd.edu/handle/1903/577 Stewart, G. W.
-"On Infinitely Many Algorithms for Solving Equations." English translation of Schr'''ö'''der's original paper.
+This is Schroeder's general result (equation 18 from [@http://drum.lib.umd.edu/handle/1903/577 Stewart, G. W.
+"On Infinitely Many Algorithms for Solving Equations." English translation of Schroeder's original paper.
 College Park, MD: University of Maryland, Institute for Advanced Computer Studies, Department of Computer Science, 1993].)
 This method guarantees at least quadratic convergence (the same as Newton's method), and is known
 to work well in the presence of multiple roots:

From db0dfc7531e959f0628cb84a2bf4fee45c6f1acc Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 10:33:56 -0400
Subject: [PATCH 07/31] Fix missing end section

---
 doc/overview/gpu.qbk | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
index 18ebaba2a..b97b059a6 100644
--- a/doc/overview/gpu.qbk
+++ b/doc/overview/gpu.qbk
@@ -55,6 +55,8 @@ And lastly on SYCL:

 Once your kernel function has been written, use the framework's mechanism for launching the kernel.

+[endsect] [/section:gpu Support for GPU programming in Boost.Math]
+
 [/
   Copyright 2024.
Matt Borland Distributed under the Boost Software License, Version 1.0. From 81cf65ccd6ebb4c0c4303a8b5efaa9671cb64808 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 10:48:07 -0400 Subject: [PATCH 08/31] Add GPU markers to fisher f dist --- include/boost/math/distributions/fisher_f.hpp | 69 ++++++++++--------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/include/boost/math/distributions/fisher_f.hpp b/include/boost/math/distributions/fisher_f.hpp index e22cdf50a..56b288d88 100644 --- a/include/boost/math/distributions/fisher_f.hpp +++ b/include/boost/math/distributions/fisher_f.hpp @@ -1,5 +1,5 @@ // Copyright John Maddock 2006. - +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -8,14 +8,15 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP #define BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP +#include +#include +#include #include #include // for incomplete beta. #include // complements #include // error checks #include -#include - namespace boost{ namespace math{ template > @@ -25,9 +26,9 @@ class fisher_f_distribution typedef RealType value_type; typedef Policy policy_type; - fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j) + BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j) { - static const char* function = "fisher_f_distribution<%1%>::fisher_f_distribution"; + constexpr auto function = "fisher_f_distribution<%1%>::fisher_f_distribution"; RealType result; detail::check_df( function, m_df1, &result, Policy()); @@ -35,11 +36,11 @@ class fisher_f_distribution function, m_df2, &result, Policy()); } // fisher_f_distribution - RealType degrees_of_freedom1()const + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const { return m_df1; } - RealType degrees_of_freedom2()const + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const { return m_df2; } @@ -60,29 +61,29 @@ fisher_f_distribution(RealType,RealType)->fisher_f_distribution -inline const std::pair range(const fisher_f_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair range(const fisher_f_distribution& /*dist*/) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline const std::pair support(const fisher_f_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair support(const fisher_f_distribution& /*dist*/) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. 
using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -RealType pdf(const fisher_f_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED RealType pdf(const fisher_f_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: RealType error_result = 0; - static const char* function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)"; if(false == (detail::check_df( function, df1, &error_result, Policy()) && detail::check_df( @@ -132,9 +133,9 @@ RealType pdf(const fisher_f_distribution& dist, const RealType } // pdf template -inline RealType cdf(const fisher_f_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const fisher_f_distribution& dist, const RealType& x) { - static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -167,9 +168,9 @@ inline RealType cdf(const fisher_f_distribution& dist, const R } // cdf template -inline RealType quantile(const fisher_f_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const fisher_f_distribution& dist, const RealType& p) { - static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -192,9 +193,9 @@ inline RealType quantile(const fisher_f_distribution& dist, co } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { - static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = c.dist.degrees_of_freedom1(); RealType df2 = c.dist.degrees_of_freedom2(); RealType x = c.param; @@ -228,9 +229,9 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { - static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)"; RealType df1 = c.dist.degrees_of_freedom1(); RealType df2 = c.dist.degrees_of_freedom2(); RealType p = c.param; @@ -252,9 +253,9 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const fisher_f_distribution& dist) { // Mean of F distribution = v. 
- static const char* function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -273,9 +274,9 @@ inline RealType mean(const fisher_f_distribution& dist) } // mean template -inline RealType variance(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const fisher_f_distribution& dist) { // Variance of F distribution. - static const char* function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -294,9 +295,9 @@ inline RealType variance(const fisher_f_distribution& dist) } // variance template -inline RealType mode(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const fisher_f_distribution& dist) { - static const char* function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -317,15 +318,15 @@ inline RealType mode(const fisher_f_distribution& dist) //template //inline RealType median(const fisher_f_distribution& dist) //{ // Median of Fisher F distribution is not defined. -// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", std::numeric_limits::quiet_NaN()); +// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", boost::math::numeric_limits::quiet_NaN()); // } // median // Now implemented via quantile(half) in derived accessors. 
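// A minimal usage sketch (an illustration only, assuming a CUDA translation
// unit; the kernel name and launch setup below are assumptions, not part of
// this header): with the BOOST_MATH_GPU_ENABLED annotations above, the
// non-member accessors can be evaluated from device code just as on the host:
//
//   __global__ void fisher_f_variance(double* out, int n)
//   {
//       const int i = blockDim.x * blockIdx.x + threadIdx.x;
//       if (i < n)
//       {
//           const boost::math::fisher_f_distribution<double> dist(5, 10);
//           out[i] = variance(dist); // found via ADL, exactly as in host code
//       }
//   }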
template -inline RealType skewness(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const fisher_f_distribution& dist) { - static const char* function = "boost::math::skewness(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::skewness(fisher_f_distribution<%1%> const&)"; BOOST_MATH_STD_USING // ADL of std names // See http://mathworld.wolfram.com/F-Distribution.html RealType df1 = dist.degrees_of_freedom1(); @@ -346,18 +347,18 @@ inline RealType skewness(const fisher_f_distribution& dist) } template -RealType kurtosis_excess(const fisher_f_distribution& dist); +BOOST_MATH_GPU_ENABLED RealType kurtosis_excess(const fisher_f_distribution& dist); template -inline RealType kurtosis(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const fisher_f_distribution& dist) { return 3 + kurtosis_excess(dist); } template -inline RealType kurtosis_excess(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const fisher_f_distribution& dist) { - static const char* function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)"; // See http://mathworld.wolfram.com/F-Distribution.html RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); From 0882eccc76e3932e83c16516dbcdad39e7879137 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 10:48:17 -0400 Subject: [PATCH 09/31] Add SYCL testing of fisher f dist --- test/sycl_jamfile | 1 + test/test_fisher_f.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 5d3d85cd8..baf2f95d2 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -17,6 +17,7 @@ run test_cauchy.cpp ; run test_chi_squared.cpp ; run test_exponential_dist.cpp ; run test_extreme_value.cpp ; +run test_fisher_f.cpp ; run test_holtsmark.cpp ; run test_landau.cpp ; run test_laplace.cpp ; diff --git a/test/test_fisher_f.cpp b/test/test_fisher_f.cpp index c18ed8ff1..f142a3327 100644 --- a/test/test_fisher_f.cpp +++ b/test/test_fisher_f.cpp @@ -8,9 +8,13 @@ // (See accompanying file LICENSE_1_0.txt // or copy at http://www.boost.org/LICENSE_1_0.txt) -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; +#endif #include // for fisher_f_distribution using boost::math::fisher_f_distribution; From 438c2546eaee899a9239bff2ef57accf8a944ac1 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 11:05:57 -0400 Subject: [PATCH 10/31] Add CUDA fisher f dist testing --- test/cuda_jamfile | 7 ++ test/test_fisher_f_cdf_double.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_cdf_float.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_pdf_double.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_pdf_float.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_quan_double.cu | 109 ++++++++++++++++++++++++++++++ test/test_fisher_f_quan_float.cu | 109 ++++++++++++++++++++++++++++++ 7 files changed, 661 insertions(+) create mode 100644 test/test_fisher_f_cdf_double.cu create mode 100644 test/test_fisher_f_cdf_float.cu create mode 100644 test/test_fisher_f_pdf_double.cu create mode 100644 test/test_fisher_f_pdf_float.cu create mode 100644 test/test_fisher_f_quan_double.cu create mode 100644 
test/test_fisher_f_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index c697da8e9..f517f4257 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -65,6 +65,13 @@ run test_extreme_value_pdf_float.cu ; run test_extreme_value_quan_double.cu ; run test_extreme_value_quan_float.cu ; +run test_fisher_f_cdf_double.cu ; +run test_fisher_f_cdf_float.cu ; +run test_fisher_f_pdf_double.cu ; +run test_fisher_f_pdf_float.cu ; +run test_fisher_f_quan_double.cu ; +run test_fisher_f_quan_float.cu ; + run test_holtsmark_cdf_double.cu ; run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; diff --git a/test/test_fisher_f_cdf_double.cu b/test/test_fisher_f_cdf_double.cu new file mode 100644 index 000000000..877961166 --- /dev/null +++ b/test/test_fisher_f_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_cdf_float.cu b/test/test_fisher_f_cdf_float.cu new file mode 100644 index 000000000..a6fcc9f98 --- /dev/null +++ b/test/test_fisher_f_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_pdf_double.cu b/test/test_fisher_f_pdf_double.cu new file mode 100644 index 000000000..e4ae50791 --- /dev/null +++ b/test/test_fisher_f_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_pdf_float.cu b/test/test_fisher_f_pdf_float.cu new file mode 100644 index 000000000..7b7583736 --- /dev/null +++ b/test/test_fisher_f_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_double.cu b/test/test_fisher_f_quan_double.cu new file mode 100644 index 000000000..42bcb0dac --- /dev/null +++ b/test/test_fisher_f_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_float.cu b/test/test_fisher_f_quan_float.cu new file mode 100644 index 000000000..3a0bc688b --- /dev/null +++ b/test/test_fisher_f_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file From ee54ae0a93eeae2e3a0e8ab685eee73cc997986f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 11:25:00 -0400 Subject: [PATCH 11/31] Add NVRTC fisher f dist testing --- test/nvrtc_jamfile | 7 + test/test_fisher_f_cdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_cdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_pdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_pdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_quan_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_fisher_f_quan_nvrtc_float.cpp | 191 +++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_fisher_f_cdf_nvrtc_double.cpp create mode 100644 test/test_fisher_f_cdf_nvrtc_float.cpp create mode 100644 test/test_fisher_f_pdf_nvrtc_double.cpp create mode 100644 test/test_fisher_f_pdf_nvrtc_float.cpp create mode 100644 test/test_fisher_f_quan_nvrtc_double.cpp create mode 100644 test/test_fisher_f_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 1fc2746a1..438e41c88 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -59,6 +59,13 @@ run test_extreme_value_pdf_nvrtc_float.cpp ; run test_extreme_value_quan_nvrtc_double.cpp ; run test_extreme_value_quan_nvrtc_float.cpp ; +run test_fisher_f_cdf_nvrtc_double.cpp ; +run test_fisher_f_cdf_nvrtc_float.cpp ; +run test_fisher_f_pdf_nvrtc_double.cpp ; +run test_fisher_f_pdf_nvrtc_float.cpp ; +run test_fisher_f_quan_nvrtc_double.cpp ; +run test_fisher_f_quan_nvrtc_float.cpp ; + run test_holtsmark_cdf_nvrtc_double.cpp ; run test_holtsmark_cdf_nvrtc_float.cpp ; run test_holtsmark_pdf_nvrtc_double.cpp ; diff --git a/test/test_fisher_f_cdf_nvrtc_double.cpp b/test/test_fisher_f_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..1eb9cb00f --- /dev/null +++ b/test/test_fisher_f_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_cdf_nvrtc_float.cpp b/test/test_fisher_f_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..244190cf1 --- /dev/null +++ b/test/test_fisher_f_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_pdf_nvrtc_double.cpp b/test/test_fisher_f_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..8aa1482aa --- /dev/null +++ b/test/test_fisher_f_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_pdf_nvrtc_float.cpp b/test/test_fisher_f_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..e461dea9a --- /dev/null +++ b/test/test_fisher_f_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_nvrtc_double.cpp b/test/test_fisher_f_quan_nvrtc_double.cpp new file mode 100644 index 000000000..16ad0cbc0 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_nvrtc_float.cpp b/test/test_fisher_f_quan_nvrtc_float.cpp new file mode 100644 index 000000000..377048e52 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From d573ab35cf2230f0b80473b6e1a10fbe9b004ec7 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 11:33:56 -0400 Subject: [PATCH 12/31] Add GPU support to gamma dist --- include/boost/math/distributions/gamma.hpp | 84 +++++++++++----------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/include/boost/math/distributions/gamma.hpp b/include/boost/math/distributions/gamma.hpp index 28b7c55b0..5176f906d 100644 --- a/include/boost/math/distributions/gamma.hpp +++ b/include/boost/math/distributions/gamma.hpp @@ -1,4 +1,5 @@ // Copyright John Maddock 2006. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. 
(See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,22 +11,22 @@
 // http://mathworld.wolfram.com/GammaDistribution.html
 // http://en.wikipedia.org/wiki/Gamma_distribution
 
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/tools/tuple.hpp>
 #include <boost/math/distributions/fwd.hpp>
 #include <boost/math/tools/roots.hpp>
 #include <boost/math/special_functions/gamma.hpp>
 #include <boost/math/distributions/detail/common_error_handling.hpp>
 #include <boost/math/distributions/complement.hpp>
-#include <cmath>
-#include <utility>
-
 namespace boost{
 namespace math
 {
 namespace detail
 {
 
 template <class RealType, class Policy>
-inline bool check_gamma_shape(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_shape(
       const char* function,
       RealType shape,
       RealType* result, const Policy& pol)
@@ -41,7 +42,7 @@ inline bool check_gamma_shape(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma_x(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_x(
       const char* function,
       RealType const& x,
       RealType* result, const Policy& pol)
@@ -57,7 +58,7 @@ inline bool check_gamma_x(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma(
       const char* function,
       RealType scale,
       RealType shape,
@@ -75,19 +76,19 @@
    using value_type = RealType;
    using policy_type = Policy;
 
-   explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
+   BOOST_MATH_GPU_ENABLED explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
       : m_shape(l_shape), m_scale(l_scale)
    {
       RealType result;
      detail::check_gamma("boost::math::gamma_distribution<%1%>::gamma_distribution", l_scale, l_shape, &result, Policy());
    }
 
-   RealType shape()const
+   BOOST_MATH_GPU_ENABLED RealType shape()const
    {
      return m_shape;
    }
 
-   RealType scale()const
+   BOOST_MATH_GPU_ENABLED RealType scale()const
    {
      return m_scale;
    }
@@ -109,27 +110,27 @@ gamma_distribution(RealType,RealType)->gamma_distribution<typename boost::math::tools::promote_args<RealType>::type>;
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of permissible values for random variable x.
    using boost::math::tools::max_value;
-   return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+   return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of supported values for random variable x.
    // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
   using boost::math::tools::max_value;
    using boost::math::tools::min_value;
-   return std::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
+   return boost::math::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -149,17 +150,17 @@ inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const Real
 } // pdf
 
 template <class RealType, class Policy>
-inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
    using boost::math::lgamma;
 
-   static const char* function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType k = dist.shape();
    RealType theta = dist.scale();
 
-   RealType result = -std::numeric_limits<RealType>::infinity();
+   RealType result = -boost::math::numeric_limits<RealType>::infinity();
    if(false == detail::check_gamma(function, theta, k, &result, Policy()))
       return result;
    if(false == detail::check_gamma_x(function, x, &result, Policy()))
@@ -167,7 +168,7 @@ inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const R
 
    if(x == 0)
    {
-      return std::numeric_limits<RealType>::quiet_NaN();
+      return boost::math::numeric_limits<RealType>::quiet_NaN();
    }
 
    result = -k*log(theta) + (k-1)*log(x) - lgamma(k) - (x/theta);
@@ -176,11 +177,11 @@ inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const R
 } // logpdf
 
 template <class RealType, class Policy>
-inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -196,11 +197,11 @@ inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const Real
 } // cdf
 
 template <class RealType, class Policy>
-inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -220,11 +221,11 @@ inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const
 }
 
 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -241,11 +242,11 @@ inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy
 }
 
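+// Complement quantile: returns x such that 1 - cdf(dist, x) == q.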
 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -266,11 +267,11 @@ inline RealType quantile(const complemented2_type<gamma_distribution<RealType, P
 }
 
 template <class RealType, class Policy>
-inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mean(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mean(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -284,11 +285,11 @@ inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::variance(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::variance(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -302,11 +303,11 @@ inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mode(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mode(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -331,11 +332,11 @@ inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
 //}
 
 template <class RealType, class Policy>
-inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::skewness(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::skewness(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -349,11 +350,11 @@ inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -367,18 +368,19 @@ inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
 {
    return kurtosis_excess(dist) + 3;
 }
 
 template <class RealType, class Policy>
-inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
 {
+   BOOST_MATH_STD_USING
+
    RealType k = dist.shape();
    RealType theta = dist.scale();
-   using std::log;
-   return k + log(theta) + lgamma(k) + (1-k)*digamma(k);
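+   // differential entropy of Gamma(k, theta): k + log(theta) + lgamma(k) + (1 - k)*digamma(k)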
+   return k + log(theta) + boost::math::lgamma(k) + (1-k)*digamma(k);
 }
 
 } // namespace math

From 6f9c91e18e8bea043f9ef2ccce3b185b018e99bd Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:42:41 -0400
Subject: [PATCH 13/31] Add SYCL testing of gamma dist

---
 test/sycl_jamfile        | 1 +
 test/test_gamma_dist.cpp | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index baf2f95d2..03b130268 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -18,6 +18,7 @@ run test_chi_squared.cpp ;
 run test_exponential_dist.cpp ;
 run test_extreme_value.cpp ;
 run test_fisher_f.cpp ;
+run test_gamma_dist.cpp ;
 run test_holtsmark.cpp ;
 run test_landau.cpp ;
 run test_laplace.cpp ;
diff --git a/test/test_gamma_dist.cpp b/test/test_gamma_dist.cpp
index b7776c79c..2b1a181f3 100644
--- a/test/test_gamma_dist.cpp
+++ b/test/test_gamma_dist.cpp
@@ -15,16 +15,23 @@
 // From MathWorld--A Wolfram Web Resource.
 // http://mathworld.wolfram.com/GammaDistribution.html
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch.hpp> // include directory libs/math/src/tr1/ is needed.
+#endif
+
+#include <boost/math/tools/config.hpp>
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp> // for real_concept
+#endif
+
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp> // Boost.Test
 #include <boost/test/tools/floating_point_comparison.hpp>
 
 #include <boost/math/distributions/gamma.hpp>
     using boost::math::gamma_distribution;
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include "test_out_of_range.hpp"
 
 #include <iostream>

From 49c0190e6eb459e014426205b5a0715acd34b378 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:57:15 -0400
Subject: [PATCH 14/31] Add CUDA gamma dist testing

---
 test/cuda_jamfile                   |   7 ++
 test/test_gamma_dist_cdf_double.cu  | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_cdf_float.cu   | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_pdf_double.cu  | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_pdf_float.cu   | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_quan_double.cu | 109 ++++++++++++++++++++++++++++
 test/test_gamma_dist_quan_float.cu  | 109 ++++++++++++++++++++++++++++
 7 files changed, 661 insertions(+)
 create mode 100644 test/test_gamma_dist_cdf_double.cu
 create mode 100644 test/test_gamma_dist_cdf_float.cu
 create mode 100644 test/test_gamma_dist_pdf_double.cu
 create mode 100644 test/test_gamma_dist_pdf_float.cu
 create mode 100644 test/test_gamma_dist_quan_double.cu
 create mode 100644 test/test_gamma_dist_quan_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index f517f4257..b01aa8bb1 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -72,6 +72,13 @@ run test_fisher_f_pdf_float.cu ;
 run test_fisher_f_quan_double.cu ;
 run test_fisher_f_quan_float.cu ;
 
+run test_gamma_dist_cdf_double.cu ;
+run test_gamma_dist_cdf_float.cu ;
+run test_gamma_dist_pdf_double.cu ;
+run test_gamma_dist_pdf_float.cu ;
+run test_gamma_dist_quan_double.cu ;
+run test_gamma_dist_quan_float.cu ;
+
 run test_holtsmark_cdf_double.cu ;
 run test_holtsmark_cdf_float.cu ;
 run test_holtsmark_pdf_double.cu ;
diff --git a/test/test_gamma_dist_cdf_double.cu b/test/test_gamma_dist_cdf_double.cu
new file mode 100644
index 000000000..6424850c3
--- /dev/null
+++ b/test/test_gamma_dist_cdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
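+// Uses the cuda_managed_ptr and stopwatch helpers that live alongside these tests.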
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA Kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::gamma_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_gamma_dist_cdf_float.cu b/test/test_gamma_dist_cdf_float.cu
new file mode 100644
index 000000000..4f2312ccc
--- /dev/null
+++ b/test/test_gamma_dist_cdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_pdf_double.cu b/test/test_gamma_dist_pdf_double.cu new file mode 100644 index 000000000..2f8bbc5f4 --- /dev/null +++ b/test/test_gamma_dist_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_pdf_float.cu b/test/test_gamma_dist_pdf_float.cu new file mode 100644 index 000000000..2080f5ccf --- /dev/null +++ b/test/test_gamma_dist_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_double.cu b/test/test_gamma_dist_quan_double.cu new file mode 100644 index 000000000..bde18fc36 --- /dev/null +++ b/test/test_gamma_dist_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_float.cu b/test/test_gamma_dist_quan_float.cu new file mode 100644 index 000000000..01ce85dfd --- /dev/null +++ b/test/test_gamma_dist_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file

From f8565d7b8b209c853f3dd059bf301caed4a1bb44 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:57:27 -0400
Subject: [PATCH 15/31] Add NVRTC gamma dist testing

---
 test/nvrtc_jamfile                         |   7 +
 test/test_gamma_dist_cdf_nvrtc_double.cpp  | 191 +++++++++++++++++++++
 test/test_gamma_dist_cdf_nvrtc_float.cpp   | 191 +++++++++++++++++++++
 test/test_gamma_dist_pdf_nvrtc_double.cpp  | 191 +++++++++++++++++++++
 test/test_gamma_dist_pdf_nvrtc_float.cpp   | 191 +++++++++++++++++++++
 test/test_gamma_dist_quan_nvrtc_double.cpp | 191 +++++++++++++++++++++
 test/test_gamma_dist_quan_nvrtc_float.cpp  | 191 +++++++++++++++++++++
 7 files changed, 1153 insertions(+)
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_quan_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_quan_nvrtc_float.cpp

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 438e41c88..94fc6cc9b 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -66,6 +66,13 @@ run test_fisher_f_pdf_nvrtc_float.cpp ;
 run test_fisher_f_quan_nvrtc_double.cpp ;
 run test_fisher_f_quan_nvrtc_float.cpp ;
 
+run test_gamma_dist_cdf_nvrtc_double.cpp ;
+run test_gamma_dist_cdf_nvrtc_float.cpp ;
+run test_gamma_dist_pdf_nvrtc_double.cpp ;
+run test_gamma_dist_pdf_nvrtc_float.cpp ;
+run test_gamma_dist_quan_nvrtc_double.cpp ;
+run test_gamma_dist_quan_nvrtc_float.cpp ;
+
 run test_holtsmark_cdf_nvrtc_double.cpp ;
 run test_holtsmark_cdf_nvrtc_float.cpp ;
 run test_holtsmark_pdf_nvrtc_double.cpp ;
diff --git a/test/test_gamma_dist_cdf_nvrtc_double.cpp b/test/test_gamma_dist_cdf_nvrtc_double.cpp
new file mode 100644
index 000000000..3e911f4e0
--- /dev/null
+++ b/test/test_gamma_dist_cdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
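+            // Only in1 is consumed by this kernel; the second buffer is filled anyway,
+            // presumably to keep the launch signature uniform across these tests.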
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at element: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_gamma_dist_cdf_nvrtc_float.cpp b/test/test_gamma_dist_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..17762d406
--- /dev/null
+++ b/test/test_gamma_dist_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
diff --git a/test/test_gamma_dist_cdf_nvrtc_float.cpp b/test/test_gamma_dist_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..17762d406
--- /dev/null
+++ b/test/test_gamma_dist_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
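One driver-API detail worth noting in these tests: `cuLaunchKernel` takes its kernel arguments as an array of pointers to the arguments, one slot per kernel parameter in declaration order, which is why the unused second input still occupies a slot. Annotated from the code above:

    // One slot per kernel parameter, each holding the address of the argument:
    // (const float_type* in1, const float_type* in2, float_type* out, int numElements)
    void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
    cuLaunchKernel(kernel,
                   numBlocks, 1, 1,  // grid dimensions
                   blockSize, 1, 1,  // block dimensions
                   0,                // dynamic shared memory (bytes)
                   0,                // stream (0 = default stream)
                   args,             // kernel parameters
                   0);               // extra options (unused)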
diff --git a/test/test_gamma_dist_pdf_nvrtc_double.cpp b/test/test_gamma_dist_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..1faae9986
--- /dev/null
+++ b/test/test_gamma_dist_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
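Every test sizes its grid with the usual ceiling division so the grid covers all elements even when `numElements` is not a multiple of the block size; the `if (i < numElements)` guard in the kernel discards the overshoot. With the values used here:

    int numElements = 5000;
    int blockSize   = 256;
    int numBlocks   = (numElements + blockSize - 1) / blockSize;
    // numBlocks == (5000 + 255) / 256 == 20, so 20 * 256 = 5120 threads are
    // launched and the final 120 fail the bounds check and do nothing.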
diff --git a/test/test_gamma_dist_pdf_nvrtc_float.cpp b/test/test_gamma_dist_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..054ddbbad
--- /dev/null
+++ b/test/test_gamma_dist_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
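The host-side check recomputes each value serially and compares with `boost::math::epsilon_difference` (from `<boost/math/special_functions/relative_difference.hpp>`), which reports the gap between two values in units of machine epsilon, so the `> 300` threshold flags device results more than roughly 300 epsilon from the serial ones. A small host-only sketch with illustrative numbers (not taken from the tests):

    #include <boost/math/special_functions/relative_difference.hpp>
    #include <iostream>

    int main()
    {
        double serial   = 0.63212055882855767;      // cdf of gamma(1, 1) at 1, i.e. 1 - exp(-1)
        double parallel = serial * (1.0 + 1.0e-13); // pretend the device drifted slightly
        // Prints the distance in multiples of machine epsilon (roughly 450 here)
        std::cout << boost::math::epsilon_difference(serial, parallel) << '\n';
    }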
diff --git a/test/test_gamma_dist_quan_nvrtc_double.cpp b/test/test_gamma_dist_quan_nvrtc_double.cpp
new file mode 100644
index 000000000..132efcd6c
--- /dev/null
+++ b/test/test_gamma_dist_quan_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_gamma_dist_quan_nvrtc_float.cpp b/test/test_gamma_dist_quan_nvrtc_float.cpp
new file mode 100644
index 000000000..7749523ab
--- /dev/null
+++ b/test/test_gamma_dist_quan_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From 75e1710bfbbf81b60a929809c9ceba6cb59d4acd Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 11:59:27 -0400
Subject: [PATCH 16/31] Reduce number of threads per block since it can crash CI

---
 test/test_arcsine_cdf_double.cu | 2 +-
 test/test_arcsine_cdf_float.cu | 2 +-
 test/test_arcsine_pdf_double.cu | 2 +-
 test/test_arcsine_pdf_float.cu | 2 +-
 test/test_arcsine_quan_double.cu | 2 +-
 test/test_arcsine_quan_float.cu | 2 +-
 test/test_arcsine_range_support_double.cu | 2 +-
 test/test_arcsine_range_support_float.cu | 2 +-
 test/test_bernoulli_cdf_double.cu | 2 +-
 test/test_bernoulli_cdf_float.cu | 2 +-
 test/test_bernoulli_pdf_double.cu | 2 +-
 test/test_bernoulli_pdf_float.cu | 2 +-
 test/test_bernoulli_range_support_double.cu | 2 +-
 test/test_bernoulli_range_support_float.cu | 2 +-
 test/test_beta_dist_cdf_double.cu | 2 +-
 test/test_beta_dist_cdf_float.cu | 2 +-
 test/test_beta_dist_pdf_double.cu | 2 +-
 test/test_beta_dist_pdf_float.cu | 2 +-
 test/test_beta_dist_quan_double.cu | 2 +-
 test/test_beta_dist_quan_float.cu | 2 +-
 test/test_cauchy_cdf_double.cu | 2 +-
 test/test_cauchy_cdf_float.cu | 2 +-
 test/test_cauchy_pdf_double.cu | 2 +-
 test/test_cauchy_pdf_float.cu | 2 +-
 test/test_cauchy_quan_double.cu | 2 +-
 test/test_cauchy_quan_float.cu | 2 +-
 test/test_cauchy_range_support_double.cu | 2 +-
 test/test_cauchy_range_support_float.cu | 2 +-
 test/test_chi_squared_cdf_double.cu | 2 +-
 test/test_chi_squared_cdf_float.cu | 2 +-
 test/test_chi_squared_pdf_double.cu | 2 +-
 test/test_chi_squared_pdf_float.cu | 2 +-
 test/test_chi_squared_quan_double.cu | 2 +-
 test/test_chi_squared_quan_float.cu | 2 +-
 test/test_exponential_cdf_double.cu | 2 +-
 test/test_exponential_cdf_float.cu | 2 +-
 test/test_exponential_pdf_double.cu | 2 +-
 test/test_exponential_pdf_float.cu | 2 +-
 test/test_exponential_quan_double.cu | 2 +-
 test/test_exponential_quan_float.cu | 2 +-
 test/test_exponential_range_support_double.cu | 2 +-
 test/test_exponential_range_support_float.cu | 2 +-
 test/test_extreme_value_cdf_double.cu | 2 +-
 test/test_extreme_value_cdf_float.cu | 2 +-
 test/test_extreme_value_pdf_double.cu | 2 +-
 test/test_extreme_value_pdf_float.cu | 2 +-
 test/test_extreme_value_quan_double.cu | 2 +-
 test/test_extreme_value_quan_float.cu | 2 +-
 test/test_fisher_f_cdf_double.cu | 2 +-
 test/test_fisher_f_cdf_float.cu | 2 +-
 test/test_fisher_f_pdf_double.cu | 2 +-
 test/test_fisher_f_pdf_float.cu | 2 +-
 test/test_fisher_f_quan_double.cu | 2 +-
 test/test_fisher_f_quan_float.cu | 2 +-
 test/test_gamma_dist_cdf_double.cu | 2 +-
 test/test_gamma_dist_cdf_float.cu | 2 +-
 test/test_gamma_dist_pdf_double.cu | 2 +-
 test/test_gamma_dist_pdf_float.cu | 2 +-
 test/test_gamma_dist_quan_double.cu | 2 +-
 test/test_gamma_dist_quan_float.cu | 2 +-
 test/test_holtsmark_cdf_double.cu | 2 +-
 test/test_holtsmark_cdf_float.cu | 2 +-
 test/test_holtsmark_pdf_double.cu | 2 +-
 test/test_holtsmark_pdf_float.cu | 2 +-
 test/test_landau_cdf_double.cu | 2 +-
 test/test_landau_cdf_float.cu | 2 +-
 test/test_landau_pdf_double.cu | 2 +-
 test/test_landau_pdf_float.cu | 2 +-
 test/test_landau_quan_double.cu | 2 +-
 test/test_landau_quan_float.cu | 2 +-
 test/test_laplace_cdf_double.cu | 2 +-
 test/test_laplace_cdf_float.cu | 2 +-
 test/test_laplace_pdf_double.cu | 2 +-
 test/test_laplace_pdf_float.cu | 2 +-
 test/test_laplace_quan_double.cu | 2 +-
 test/test_laplace_quan_float.cu | 2 +-
 test/test_logistic_cdf_double.cu | 2 +-
 test/test_logistic_cdf_float.cu | 2 +-
 test/test_logistic_pdf_double.cu | 2 +-
 test/test_logistic_pdf_float.cu | 2 +-
 test/test_logistic_quan_double.cu | 2 +-
 test/test_logistic_quan_float.cu | 2 +-
 test/test_mapairy_cdf_double.cu | 2 +-
 test/test_mapairy_cdf_float.cu | 2 +-
 test/test_mapairy_pdf_double.cu | 2 +-
 test/test_mapairy_pdf_float.cu | 2 +-
 test/test_mapairy_quan_double.cu | 2 +-
 test/test_mapairy_quan_float.cu | 2 +-
 test/test_saspoint5_cdf_double.cu | 2 +-
 test/test_saspoint5_cdf_float.cu | 2 +-
 test/test_saspoint5_pdf_double.cu | 2 +-
 test/test_saspoint5_pdf_float.cu | 2 +-
 test/test_saspoint5_quan_double.cu | 2 +-
 test/test_saspoint5_quan_float.cu | 2 +-
 test/test_weibull_cdf_double.cu | 2 +-
 test/test_weibull_cdf_float.cu | 2 +-
 test/test_weibull_pdf_double.cu | 2 +-
 test/test_weibull_pdf_float.cu | 2 +-
 test/test_weibull_quan_double.cu | 2 +-
 test/test_weibull_quan_float.cu | 2 +-
 100 files changed, 100 insertions(+), 100 deletions(-)

diff --git a/test/test_arcsine_cdf_double.cu b/test/test_arcsine_cdf_double.cu
index d6f6f7b35..3ac9e22cd 100644
--- a/test/test_arcsine_cdf_double.cu
+++ b/test/test_arcsine_cdf_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_arcsine_cdf_float.cu b/test/test_arcsine_cdf_float.cu
index 148b1dffb..cc73ce95b 100644
--- a/test/test_arcsine_cdf_float.cu
+++ b/test/test_arcsine_cdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int
threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_double.cu b/test/test_arcsine_pdf_double.cu index 7a73bb34e..8f45017ba 100644 --- a/test/test_arcsine_pdf_double.cu +++ b/test/test_arcsine_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_float.cu b/test/test_arcsine_pdf_float.cu index 54a11253c..c236b7876 100644 --- a/test/test_arcsine_pdf_float.cu +++ b/test/test_arcsine_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_double.cu b/test/test_arcsine_quan_double.cu index 31f6eac8a..a45737063 100644 --- a/test/test_arcsine_quan_double.cu +++ b/test/test_arcsine_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_float.cu b/test/test_arcsine_quan_float.cu index 6decb347b..fd8cd11fc 100644 --- a/test/test_arcsine_quan_float.cu +++ b/test/test_arcsine_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_double.cu b/test/test_arcsine_range_support_double.cu index cec919a1a..b3fb575fa 100644 --- a/test/test_arcsine_range_support_double.cu +++ b/test/test_arcsine_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_float.cu b/test/test_arcsine_range_support_float.cu index d397e0c86..d207d0598 100644 --- a/test/test_arcsine_range_support_float.cu +++ b/test/test_arcsine_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_double.cu b/test/test_bernoulli_cdf_double.cu index e4c21ca06..1a6dce645 100644 --- a/test/test_bernoulli_cdf_double.cu +++ 
b/test/test_bernoulli_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_float.cu b/test/test_bernoulli_cdf_float.cu index 82c0eabc0..998f24736 100644 --- a/test/test_bernoulli_cdf_float.cu +++ b/test/test_bernoulli_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_double.cu b/test/test_bernoulli_pdf_double.cu index 24b33c16c..147e2f340 100644 --- a/test/test_bernoulli_pdf_double.cu +++ b/test/test_bernoulli_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_float.cu b/test/test_bernoulli_pdf_float.cu index 08d2ca5a0..49eaea32f 100644 --- a/test/test_bernoulli_pdf_float.cu +++ b/test/test_bernoulli_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_double.cu b/test/test_bernoulli_range_support_double.cu index 86c77bd11..ade952fca 100644 --- a/test/test_bernoulli_range_support_double.cu +++ b/test/test_bernoulli_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_float.cu b/test/test_bernoulli_range_support_float.cu index cdcf54418..ef276b938 100644 --- a/test/test_bernoulli_range_support_float.cu +++ b/test/test_bernoulli_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_cdf_double.cu b/test/test_beta_dist_cdf_double.cu index fa460244a..9188f4305 100644 --- a/test/test_beta_dist_cdf_double.cu +++ b/test/test_beta_dist_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << 
std::endl; diff --git a/test/test_beta_dist_cdf_float.cu b/test/test_beta_dist_cdf_float.cu index 321c84420..0278f6415 100644 --- a/test/test_beta_dist_cdf_float.cu +++ b/test/test_beta_dist_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_double.cu b/test/test_beta_dist_pdf_double.cu index c0ee9272a..e86cf94dd 100644 --- a/test/test_beta_dist_pdf_double.cu +++ b/test/test_beta_dist_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_float.cu b/test/test_beta_dist_pdf_float.cu index 75e4fa27b..97dd606f2 100644 --- a/test/test_beta_dist_pdf_float.cu +++ b/test/test_beta_dist_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_double.cu b/test/test_beta_dist_quan_double.cu index 101526afa..a6b842e8e 100644 --- a/test/test_beta_dist_quan_double.cu +++ b/test/test_beta_dist_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_float.cu b/test/test_beta_dist_quan_float.cu index 77696c639..48a860f4c 100644 --- a/test/test_beta_dist_quan_float.cu +++ b/test/test_beta_dist_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_cdf_double.cu b/test/test_cauchy_cdf_double.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_double.cu +++ b/test/test_cauchy_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_cdf_float.cu b/test/test_cauchy_cdf_float.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_float.cu +++ b/test/test_cauchy_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << 
blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_double.cu b/test/test_cauchy_pdf_double.cu index 7a7fe5ba6..62398c31e 100644 --- a/test/test_cauchy_pdf_double.cu +++ b/test/test_cauchy_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_float.cu b/test/test_cauchy_pdf_float.cu index 5ec3b604b..aff3369b8 100644 --- a/test/test_cauchy_pdf_float.cu +++ b/test/test_cauchy_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_double.cu b/test/test_cauchy_quan_double.cu index 21f4b4dda..0fcaaafe7 100644 --- a/test/test_cauchy_quan_double.cu +++ b/test/test_cauchy_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_float.cu b/test/test_cauchy_quan_float.cu index b6bed1520..9c04c5b12 100644 --- a/test/test_cauchy_quan_float.cu +++ b/test/test_cauchy_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_double.cu b/test/test_cauchy_range_support_double.cu index 4ec792ce3..3a42c1bd3 100644 --- a/test/test_cauchy_range_support_double.cu +++ b/test/test_cauchy_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_float.cu b/test/test_cauchy_range_support_float.cu index 1cdd90e40..e713736e6 100644 --- a/test/test_cauchy_range_support_float.cu +++ b/test/test_cauchy_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_double.cu b/test/test_chi_squared_cdf_double.cu index 1b0c34ce6..c2475883b 100644 --- a/test/test_chi_squared_cdf_double.cu +++ b/test/test_chi_squared_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int 
blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_float.cu b/test/test_chi_squared_cdf_float.cu index 8ca99ed2e..07dce0d06 100644 --- a/test/test_chi_squared_cdf_float.cu +++ b/test/test_chi_squared_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_double.cu b/test/test_chi_squared_pdf_double.cu index ed45246d3..30edafd05 100644 --- a/test/test_chi_squared_pdf_double.cu +++ b/test/test_chi_squared_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_float.cu b/test/test_chi_squared_pdf_float.cu index 5a0f97db9..9b205182b 100644 --- a/test/test_chi_squared_pdf_float.cu +++ b/test/test_chi_squared_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_double.cu b/test/test_chi_squared_quan_double.cu index 3b7dad972..3fae7d966 100644 --- a/test/test_chi_squared_quan_double.cu +++ b/test/test_chi_squared_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_float.cu b/test/test_chi_squared_quan_float.cu index 3e779a090..7a717530e 100644 --- a/test/test_chi_squared_quan_float.cu +++ b/test/test_chi_squared_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_double.cu b/test/test_exponential_cdf_double.cu index 8601d1c08..e3a57e86e 100644 --- a/test/test_exponential_cdf_double.cu +++ b/test/test_exponential_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_float.cu b/test/test_exponential_cdf_float.cu index aa5ef9153..ed214a495 100644 --- a/test/test_exponential_cdf_float.cu +++ 
b/test/test_exponential_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_double.cu b/test/test_exponential_pdf_double.cu index 9a5615f1b..530b1023b 100644 --- a/test/test_exponential_pdf_double.cu +++ b/test/test_exponential_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_float.cu b/test/test_exponential_pdf_float.cu index f15ee3ea8..0801e2d0b 100644 --- a/test/test_exponential_pdf_float.cu +++ b/test/test_exponential_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_double.cu b/test/test_exponential_quan_double.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_double.cu +++ b/test/test_exponential_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_float.cu b/test/test_exponential_quan_float.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_float.cu +++ b/test/test_exponential_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_double.cu b/test/test_exponential_range_support_double.cu index eec3981d2..c19497ed5 100644 --- a/test/test_exponential_range_support_double.cu +++ b/test/test_exponential_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_float.cu b/test/test_exponential_range_support_float.cu index 00f443e52..a111090de 100644 --- a/test/test_exponential_range_support_float.cu +++ b/test/test_exponential_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid 
<< " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_double.cu b/test/test_extreme_value_cdf_double.cu index 8f7f366b3..7ca000348 100644 --- a/test/test_extreme_value_cdf_double.cu +++ b/test/test_extreme_value_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_float.cu b/test/test_extreme_value_cdf_float.cu index d1b6cc762..bc3ead6eb 100644 --- a/test/test_extreme_value_cdf_float.cu +++ b/test/test_extreme_value_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_double.cu b/test/test_extreme_value_pdf_double.cu index 4cf3fc2d0..44ccc5b71 100644 --- a/test/test_extreme_value_pdf_double.cu +++ b/test/test_extreme_value_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_float.cu b/test/test_extreme_value_pdf_float.cu index c0c5da7ee..390622f40 100644 --- a/test/test_extreme_value_pdf_float.cu +++ b/test/test_extreme_value_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_double.cu b/test/test_extreme_value_quan_double.cu index 703d2054f..41f2f69a6 100644 --- a/test/test_extreme_value_quan_double.cu +++ b/test/test_extreme_value_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_float.cu b/test/test_extreme_value_quan_float.cu index 25d982cd0..5fe16e9a8 100644 --- a/test/test_extreme_value_quan_float.cu +++ b/test/test_extreme_value_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_cdf_double.cu b/test/test_fisher_f_cdf_double.cu index 877961166..c6d6f0a94 100644 --- a/test/test_fisher_f_cdf_double.cu +++ b/test/test_fisher_f_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - 
int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_cdf_float.cu b/test/test_fisher_f_cdf_float.cu index a6fcc9f98..9df1bc869 100644 --- a/test/test_fisher_f_cdf_float.cu +++ b/test/test_fisher_f_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_pdf_double.cu b/test/test_fisher_f_pdf_double.cu index e4ae50791..77a3b655a 100644 --- a/test/test_fisher_f_pdf_double.cu +++ b/test/test_fisher_f_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_pdf_float.cu b/test/test_fisher_f_pdf_float.cu index 7b7583736..323edf342 100644 --- a/test/test_fisher_f_pdf_float.cu +++ b/test/test_fisher_f_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_quan_double.cu b/test/test_fisher_f_quan_double.cu index 42bcb0dac..c16eb2a95 100644 --- a/test/test_fisher_f_quan_double.cu +++ b/test/test_fisher_f_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f_quan_float.cu b/test/test_fisher_f_quan_float.cu index 3a0bc688b..85cf47967 100644 --- a/test/test_fisher_f_quan_float.cu +++ b/test/test_fisher_f_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_cdf_double.cu b/test/test_gamma_dist_cdf_double.cu index 6424850c3..4777196aa 100644 --- a/test/test_gamma_dist_cdf_double.cu +++ b/test/test_gamma_dist_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_cdf_float.cu b/test/test_gamma_dist_cdf_float.cu index 4f2312ccc..a93aca395 100644 --- a/test/test_gamma_dist_cdf_float.cu +++ 
b/test/test_gamma_dist_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_pdf_double.cu b/test/test_gamma_dist_pdf_double.cu index 2f8bbc5f4..a8411d5b6 100644 --- a/test/test_gamma_dist_pdf_double.cu +++ b/test/test_gamma_dist_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_pdf_float.cu b/test/test_gamma_dist_pdf_float.cu index 2080f5ccf..6ab3247ac 100644 --- a/test/test_gamma_dist_pdf_float.cu +++ b/test/test_gamma_dist_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_quan_double.cu b/test/test_gamma_dist_quan_double.cu index bde18fc36..d29bf6d6b 100644 --- a/test/test_gamma_dist_quan_double.cu +++ b/test/test_gamma_dist_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_gamma_dist_quan_float.cu b/test/test_gamma_dist_quan_float.cu index 01ce85dfd..58aa42e90 100644 --- a/test/test_gamma_dist_quan_float.cu +++ b/test/test_gamma_dist_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_cdf_double.cu b/test/test_holtsmark_cdf_double.cu index 5a02b7ddb..6b1d57041 100644 --- a/test/test_holtsmark_cdf_double.cu +++ b/test/test_holtsmark_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_cdf_float.cu b/test/test_holtsmark_cdf_float.cu index 71ae21f73..2a3533bac 100644 --- a/test/test_holtsmark_cdf_float.cu +++ b/test/test_holtsmark_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_pdf_double.cu 
b/test/test_holtsmark_pdf_double.cu index b883515a7..a53360d20 100644 --- a/test/test_holtsmark_pdf_double.cu +++ b/test/test_holtsmark_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_holtsmark_pdf_float.cu b/test/test_holtsmark_pdf_float.cu index c56815973..57052803f 100644 --- a/test/test_holtsmark_pdf_float.cu +++ b/test/test_holtsmark_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_cdf_double.cu b/test/test_landau_cdf_double.cu index 092fff00e..40bff707d 100644 --- a/test/test_landau_cdf_double.cu +++ b/test/test_landau_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_cdf_float.cu b/test/test_landau_cdf_float.cu index 143755aff..c4513c084 100644 --- a/test/test_landau_cdf_float.cu +++ b/test/test_landau_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_pdf_double.cu b/test/test_landau_pdf_double.cu index eea6f87ad..6ce3f5f78 100644 --- a/test/test_landau_pdf_double.cu +++ b/test/test_landau_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_pdf_float.cu b/test/test_landau_pdf_float.cu index a424bdd67..5818ddf8a 100644 --- a/test/test_landau_pdf_float.cu +++ b/test/test_landau_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_landau_quan_double.cu b/test/test_landau_quan_double.cu index 8cdf12588..4995bd49c 100644 --- a/test/test_landau_quan_double.cu +++ b/test/test_landau_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git 
a/test/test_landau_quan_float.cu b/test/test_landau_quan_float.cu index 8cdf12588..4995bd49c 100644 --- a/test/test_landau_quan_float.cu +++ b/test/test_landau_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_cdf_double.cu b/test/test_laplace_cdf_double.cu index cddcfa2bc..ec3c83ecd 100644 --- a/test/test_laplace_cdf_double.cu +++ b/test/test_laplace_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_cdf_float.cu b/test/test_laplace_cdf_float.cu index 2af43f9f5..96acea2fd 100644 --- a/test/test_laplace_cdf_float.cu +++ b/test/test_laplace_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_pdf_double.cu b/test/test_laplace_pdf_double.cu index 2f53c0dd1..568be622b 100644 --- a/test/test_laplace_pdf_double.cu +++ b/test/test_laplace_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_pdf_float.cu b/test/test_laplace_pdf_float.cu index a8d673dba..cb2aa67c1 100644 --- a/test/test_laplace_pdf_float.cu +++ b/test/test_laplace_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_quan_double.cu b/test/test_laplace_quan_double.cu index cddcfa2bc..ec3c83ecd 100644 --- a/test/test_laplace_quan_double.cu +++ b/test/test_laplace_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_laplace_quan_float.cu b/test/test_laplace_quan_float.cu index 2af43f9f5..96acea2fd 100644 --- a/test/test_laplace_quan_float.cu +++ b/test/test_laplace_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << 
" threads" << std::endl; diff --git a/test/test_logistic_cdf_double.cu b/test/test_logistic_cdf_double.cu index 5dd3723c5..6b4e85025 100644 --- a/test/test_logistic_cdf_double.cu +++ b/test/test_logistic_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_cdf_float.cu b/test/test_logistic_cdf_float.cu index 89d05747b..75b6ab0af 100644 --- a/test/test_logistic_cdf_float.cu +++ b/test/test_logistic_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_pdf_double.cu b/test/test_logistic_pdf_double.cu index 39aaa1597..90232a2d6 100644 --- a/test/test_logistic_pdf_double.cu +++ b/test/test_logistic_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_pdf_float.cu b/test/test_logistic_pdf_float.cu index 279112b99..0a99ff9cf 100644 --- a/test/test_logistic_pdf_float.cu +++ b/test/test_logistic_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_quan_double.cu b/test/test_logistic_quan_double.cu index ad929d442..afe8a4c8c 100644 --- a/test/test_logistic_quan_double.cu +++ b/test/test_logistic_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_logistic_quan_float.cu b/test/test_logistic_quan_float.cu index 81c22ea4b..92c371062 100644 --- a/test/test_logistic_quan_float.cu +++ b/test/test_logistic_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_cdf_double.cu b/test/test_mapairy_cdf_double.cu index 1494181bf..7cb62a934 100644 --- a/test/test_mapairy_cdf_double.cu +++ b/test/test_mapairy_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch 
with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_cdf_float.cu b/test/test_mapairy_cdf_float.cu index 41dd4615a..b67c0ee93 100644 --- a/test/test_mapairy_cdf_float.cu +++ b/test/test_mapairy_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_pdf_double.cu b/test/test_mapairy_pdf_double.cu index ad3abfbee..4ccd8b2f2 100644 --- a/test/test_mapairy_pdf_double.cu +++ b/test/test_mapairy_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_pdf_float.cu b/test/test_mapairy_pdf_float.cu index cabee4a2f..520ac9a68 100644 --- a/test/test_mapairy_pdf_float.cu +++ b/test/test_mapairy_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_quan_double.cu b/test/test_mapairy_quan_double.cu index fe6265eff..378700020 100644 --- a/test/test_mapairy_quan_double.cu +++ b/test/test_mapairy_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_mapairy_quan_float.cu b/test/test_mapairy_quan_float.cu index ad2f6b5eb..cd9d12007 100644 --- a/test/test_mapairy_quan_float.cu +++ b/test/test_mapairy_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_cdf_double.cu b/test/test_saspoint5_cdf_double.cu index 745ca2bf8..fb3e2f74c 100644 --- a/test/test_saspoint5_cdf_double.cu +++ b/test/test_saspoint5_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_cdf_float.cu b/test/test_saspoint5_cdf_float.cu index 51bc2e870..325a470bb 100644 --- a/test/test_saspoint5_cdf_float.cu +++ b/test/test_saspoint5_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) 
/ threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_pdf_double.cu b/test/test_saspoint5_pdf_double.cu index 948a09260..5392a328b 100644 --- a/test/test_saspoint5_pdf_double.cu +++ b/test/test_saspoint5_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_pdf_float.cu b/test/test_saspoint5_pdf_float.cu index 4980e9070..01fbcd472 100644 --- a/test/test_saspoint5_pdf_float.cu +++ b/test/test_saspoint5_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_quan_double.cu b/test/test_saspoint5_quan_double.cu index 764c27899..7415f0690 100644 --- a/test/test_saspoint5_quan_double.cu +++ b/test/test_saspoint5_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_saspoint5_quan_float.cu b/test/test_saspoint5_quan_float.cu index a65958fb8..d6f49084b 100644 --- a/test/test_saspoint5_quan_float.cu +++ b/test/test_saspoint5_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_weibull_cdf_double.cu b/test/test_weibull_cdf_double.cu index 65efbe252..1b2e5cf0d 100644 --- a/test/test_weibull_cdf_double.cu +++ b/test/test_weibull_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_weibull_cdf_float.cu b/test/test_weibull_cdf_float.cu index 65c3ce1ff..76bf3a4e1 100644 --- a/test/test_weibull_cdf_float.cu +++ b/test/test_weibull_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_weibull_pdf_double.cu b/test/test_weibull_pdf_double.cu index 645df4c0a..dd48b57d6 100644 --- a/test/test_weibull_pdf_double.cu +++ b/test/test_weibull_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int 
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_weibull_pdf_float.cu b/test/test_weibull_pdf_float.cu
index f1e6917f0..40064b1ed 100644
--- a/test/test_weibull_pdf_float.cu
+++ b/test/test_weibull_pdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }

     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_weibull_quan_double.cu b/test/test_weibull_quan_double.cu
index 2f0500602..9263fb536 100644
--- a/test/test_weibull_quan_double.cu
+++ b/test/test_weibull_quan_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }

     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

diff --git a/test/test_weibull_quan_float.cu b/test/test_weibull_quan_float.cu
index 3027e14dd..5dd6bd6ee 100644
--- a/test/test_weibull_quan_float.cu
+++ b/test/test_weibull_quan_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }

     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

From f9b91acf6354b0965625b23d8bfa3d0128e2ed8f Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:39:34 -0400
Subject: [PATCH 17/31] Add GPU support to the geometric dist

---
 .../boost/math/distributions/geometric.hpp | 91 +++++++++----------
 1 file changed, 44 insertions(+), 47 deletions(-)

diff --git a/include/boost/math/distributions/geometric.hpp b/include/boost/math/distributions/geometric.hpp
index 7c511ef2d..8aa78ddc9 100644
--- a/include/boost/math/distributions/geometric.hpp
+++ b/include/boost/math/distributions/geometric.hpp
@@ -36,6 +36,9 @@
 #ifndef BOOST_MATH_SPECIAL_GEOMETRIC_HPP
 #define BOOST_MATH_SPECIAL_GEOMETRIC_HPP

+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/tools/tuple.hpp>
 #include <boost/math/distributions/fwd.hpp>
 #include <boost/math/special_functions/beta.hpp> // for ibeta(a, b, x) == Ix(a, b).
 #include <boost/math/distributions/complement.hpp> // complement.
@@ -45,10 +48,6 @@
 #include <boost/math/special_functions/fpclassify.hpp>
 #include <boost/math/tools/roots.hpp>
 #include <boost/math/distributions/detail/inv_discrete_quantile.hpp>
-#include <limits> // using std::numeric_limits;
-#include <utility>
-#include <cstdint>

 #if defined (BOOST_MSVC)
 # pragma warning(push)
 // This believed not now necessary, so commented out.
@@ -64,7 +63,7 @@ namespace boost
 {
      // Common error checking routines for geometric distribution function:
      template <class RealType, class Policy>
-     inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
      {
        if( !(boost::math::isfinite)(p) || (p < 0) || (p > 1) )
        {
@@ -77,13 +76,13 @@ namespace boost
      }

      template <class RealType, class Policy>
-     inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
      {
        return check_success_fraction(function, p, result, pol);
      }

      template <class RealType, class Policy>
-     inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
      {
        if(check_dist(function, p, result, pol) == false)
        {
@@ -100,7 +99,7 @@ namespace boost
      } // Check_dist_and_k

      template <class RealType, class Policy>
-     inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
      {
        if((check_dist(function, p, result, pol) && detail::check_probability(function, prob, result, pol)) == false)
        {
@@ -117,7 +116,7 @@
      typedef RealType value_type;
      typedef Policy policy_type;

-     geometric_distribution(RealType p) : m_p(p)
+     BOOST_MATH_GPU_ENABLED geometric_distribution(RealType p) : m_p(p)
      { // Constructor stores success_fraction p.
        RealType result;
        geometric_detail::check_dist(
@@ -127,22 +126,22 @@ namespace boost
      } // geometric_distribution constructor.

      // Private data getter class member functions.
-     RealType success_fraction() const
+     BOOST_MATH_GPU_ENABLED RealType success_fraction() const
      { // Probability of success as fraction in range 0 to 1.
        return m_p;
      }
-     RealType successes() const
+     BOOST_MATH_GPU_ENABLED RealType successes() const
      { // Total number of successes r = 1 (for compatibility with negative binomial?).
        return 1;
      }

      // Parameter estimation.
      // (These are copies of negative_binomial distribution with successes = 1).
-     static RealType find_lower_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_lower_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -163,11 +162,11 @@ namespace boost
        return ibeta_inv(successes, failures + 1, alpha, static_cast<RealType*>(nullptr), Policy());
      } // find_lower_bound_on_p

-     static RealType find_upper_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_upper_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -195,12 +194,12 @@ namespace boost

      // Estimate number of trials :
      // "How many trials do I need to be P% sure of seeing k or fewer failures?"
-     static RealType find_minimum_number_of_trials(
+     BOOST_MATH_GPU_ENABLED static RealType find_minimum_number_of_trials(
        RealType k,     // number of failures (k >= 0).
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -218,7 +217,7 @@
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -244,22 +243,22 @@
 #endif

   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
  { // Range of permissible values for random variable k.
     using boost::math::tools::max_value;
-    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+    return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
  }

   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
  { // Range of supported values for random variable k.
     // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
     using boost::math::tools::max_value;
-    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+    return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
  }

   template <class RealType, class Policy>
-  inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
  { // Mean of geometric distribution = (1-p)/p.
    return (1 - dist.success_fraction() ) / dist.success_fraction();
  } // mean

  // median implemented via quantile(half) in derived accessors.

   template <class RealType, class Policy>
-  inline RealType mode(const geometric_distribution<RealType, Policy>&)
+  BOOST_MATH_GPU_ENABLED inline RealType mode(const geometric_distribution<RealType, Policy>&)
  { // Mode of geometric distribution = zero.
    BOOST_MATH_STD_USING // ADL of std functions.
    return 0;
  } // mode

   template <class RealType, class Policy>
-  inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
  { // Variance of Binomial distribution = (1-p) / p^2.
    return (1 - dist.success_fraction()) / (dist.success_fraction() * dist.success_fraction());
  } // variance

   template <class RealType, class Policy>
-  inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
  { // skewness of geometric distribution = 2-p / (sqrt(r(1-p))
    BOOST_MATH_STD_USING // ADL of std functions.
    RealType p = dist.success_fraction();
@@ -289,7 +288,7 @@ namespace boost
  } // skewness

   template <class RealType, class Policy>
-  inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
  { // kurtosis of geometric distribution
    // http://en.wikipedia.org/wiki/geometric is kurtosis_excess so add 3
    RealType p = dist.success_fraction();
@@ -297,7 +296,7 @@ namespace boost
  } // kurtosis

   template <class RealType, class Policy>
-  inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
  { // kurtosis excess of geometric distribution
    // http://mathworld.wolfram.com/Kurtosis.html table of kurtosis_excess
    RealType p = dist.success_fraction();
@@ -312,11 +311,11 @@ namespace boost
  // chf of geometric distribution provided by derived accessors.

   template <class RealType, class Policy>
-  inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
  { // Probability Density/Mass Function.
    BOOST_FPU_EXCEPTION_GUARD
    BOOST_MATH_STD_USING  // For ADL of math functions.
-   static const char* function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
    RealType p = dist.success_fraction();

    RealType result = 0;
@@ -350,9 +349,9 @@ namespace boost
  } // geometric_pdf

   template <class RealType, class Policy>
-  inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
  { // Cumulative Distribution Function of geometric.
-   static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";

    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
@@ -381,12 +380,10 @@ namespace boost
  } // cdf Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
  { // Cumulative Distribution Function of geometric.
-   using std::pow;
-   using std::log;
-   using std::exp;
-   static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+   BOOST_MATH_STD_USING
+   constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";

    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
@@ -399,7 +396,7 @@ namespace boost
        k,
        &result, Policy()))
    {
-     return -std::numeric_limits<RealType>::infinity();
+     return -boost::math::numeric_limits<RealType>::infinity();
    }
    if(k == 0)
    {
@@ -413,10 +410,10 @@ namespace boost
  } // logcdf Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
  { // Complemented Cumulative Distribution Function geometric.
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
    RealType const& k = c.param;
@@ -438,10 +435,10 @@ namespace boost
  } // cdf Complemented Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
  { // Complemented Cumulative Distribution Function geometric.
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
    // k argument may be integral, signed, or unsigned, or floating point.
    // If necessary, it has already been promoted from an integral type.
    RealType const& k = c.param;
@@ -455,21 +452,21 @@ namespace boost
        k,
        &result, Policy()))
    {
-     return -std::numeric_limits<RealType>::infinity();
+     return -boost::math::numeric_limits<RealType>::infinity();
    }

    return boost::math::log1p(-p, Policy()) * (k+1);
  } // logcdf Complemented Cumulative Distribution Function geometric.

   template <class RealType, class Policy>
-  inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
  { // Quantile, percentile/100 or Percent Point geometric function.
    // Return the number of expected failures k for a given probability p.

    // Inverse cumulative Distribution Function or Quantile (percentile / 100) of geometric Probability.
    // k argument may be integral, signed, or unsigned, or floating point.

-   static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
    BOOST_MATH_STD_USING // ADL of std functions.

    RealType success_fraction = dist.success_fraction();
@@ -513,11 +510,11 @@ namespace boost
  } // RealType quantile(const geometric_distribution dist, p)

   template <class RealType, class Policy>
-  inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
  { // Quantile or Percent Point Binomial function.
    // Return the number of expected failures k for a given
    // complement of the probability Q = 1 - P.
-   static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
    BOOST_MATH_STD_USING
    // Error checks:
    RealType x = c.param;

From 91cffdafb694327508351dfe366e6cca742f77eb Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:39:49 -0400
Subject: [PATCH 18/31] Add SYCL testing of geometric dist

---
 test/sycl_jamfile       |  1 +
 test/test_geometric.cpp | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 03b130268..c9527a8c3 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -19,6 +19,7 @@ run test_exponential_dist.cpp ;
 run test_extreme_value.cpp ;
 run test_fisher_f.cpp ;
 run test_gamma_dist.cpp ;
+run test_geometric.cpp ;
 run test_holtsmark.cpp ;
 run test_landau.cpp ;
 run test_laplace.cpp ;
diff --git a/test/test_geometric.cpp b/test/test_geometric.cpp
index 928a2aa0e..13a9e090b 100644
--- a/test/test_geometric.cpp
+++ b/test/test_geometric.cpp
@@ -26,9 +26,14 @@
 # define TEST_REAL_CONCEPT
 #endif

-#include <boost/math/tools/test.hpp>
+#include <boost/math/tools/config.hpp>
+
+#include "../include_private/boost/math/tools/test.hpp"
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp> // for real_concept
 using ::boost::math::concepts::real_concept;
+#endif

 #include <boost/math/distributions/geometric.hpp> // for geometric_distribution
 using boost::math::geometric_distribution;
@@ -64,7 +69,11 @@ void test_spot( // Test a single spot value against 'known good' values.
       RealType tol,    // Test tolerance
       RealType logtol) // Logcdf Test tolerance.
 {
-  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value || std::is_same<RealType, real_concept>::value)
+  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value
+  #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+  || std::is_same<RealType, real_concept>::value
+  #endif
+  )
   {
     logtol *= 100;
   }
@@ -376,7 +385,9 @@ if(std::numeric_limits<RealType>::is_specialized)
       static_cast<RealType>(9.9000000000003448e-201L), //
       100 * tolerance); // Note difference

-    // p nearer unity.
+    // p nearer unity.
+    // On GPU this gets flushed to 0 which has an eps difference of 3.4e+38
+    #ifndef BOOST_MATH_HAS_GPU_SUPPORT
     BOOST_CHECK_CLOSE_FRACTION( //
       pdf(geometric_distribution<RealType>(static_cast<RealType>(0.9999)), static_cast<RealType>(10) ),  // Number of failures, k
       // static_cast<RealType>(1.00156406e-040)
       static_cast<RealType>(9.999e-41), // exact from 100 digit calculator.
       2e3 * tolerance); // Note bigger tolerance needed.
+    #endif

     // Moshier Cephes 100 digits calculator says 9.999e-41
     //0.9999*pow(1-0.9999,10)

From e6518f5bac271a130a8ca6cf0ca2e35009ab1fdc Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:59:25 -0400
Subject: [PATCH 19/31] Add cuda::std::tie

---
 include/boost/math/tools/tuple.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp
index 82d23b8d7..dcc763e37 100644
--- a/include/boost/math/tools/tuple.hpp
+++ b/include/boost/math/tools/tuple.hpp
@@ -23,6 +23,7 @@
 using cuda::std::tuple;

 using cuda::std::make_pair;
+using cuda::std::tie;
 using cuda::std::get;

 using cuda::std::tuple_size;

From 4609f25f2709fddd20078992a06ce8972bddf32d Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:59:40 -0400
Subject: [PATCH 20/31] Add GPU support to inv_discrete_quantile

---
 .../detail/inv_discrete_quantile.hpp | 71 ++++++++++---------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
index 739a86666..f688345b7 100644
--- a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
+++ b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
@@ -6,7 +6,11 @@
 #ifndef BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE
 #define BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE

-#include <algorithm>
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/cstdint.hpp>
+#include <boost/math/tools/tuple.hpp>
+#include <boost/math/tools/toms748_solve.hpp>
+#include <boost/math/policies/error_handling.hpp>

 namespace boost{ namespace math{ namespace detail{

@@ -19,10 +23,10 @@ struct distribution_quantile_finder
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;

-   distribution_quantile_finder(const Dist d, value_type p, bool c)
+   BOOST_MATH_GPU_ENABLED distribution_quantile_finder(const Dist d, value_type p, bool c)
       : dist(d), target(p), comp(c) {}

-   value_type operator()(value_type const& x)
+   BOOST_MATH_GPU_ENABLED value_type operator()(value_type const& x)
    {
       return comp ? value_type(target - cdf(complement(dist, x))) : value_type(cdf(dist, x) - target);
    }
@@ -42,24 +46,24 @@ struct distribution_quantile_finder
 // in the root no longer being bracketed.
 //
 template <class Real, class Tol>
-void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}

 template <class Real>
-void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
 {
    BOOST_MATH_STD_USING
    b -= tools::epsilon<Real>() * b;
 }

 template <class Real>
-void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
 }

 template <class Real>
-void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
@@ -69,7 +73,7 @@ void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol
 // This is where all the work is done:
 //
 template <class Dist, class Tolerance>
-typename Dist::value_type
+BOOST_MATH_GPU_ENABLED typename Dist::value_type
    do_inverse_discrete_quantile(
       const Dist& dist,
      const typename Dist::value_type& p,
@@ -78,7 +82,7 @@ typename Dist::value_type
       const typename Dist::value_type& multiplier,
       typename Dist::value_type adder,
       const Tolerance& tol,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;
@@ -100,7 +104,7 @@ typename Dist::value_type
       guess = min_bound;

    value_type fa = f(guess);
-   std::uintmax_t count = max_iter - 1;
+   boost::math::uintmax_t count = max_iter - 1;
    value_type fb(fa), a(guess), b =0; // Compiler warning C4701: potentially uninitialized local variable 'b' used

    if(fa == 0)
@@ -130,7 +134,7 @@ typename Dist::value_type
       else
       {
          b = a;
-         a = (std::max)(value_type(b - 1), value_type(0));
+         a = BOOST_MATH_GPU_SAFE_MAX(value_type(b - 1), value_type(0));
         if(a < min_bound)
            a = min_bound;
         fa = f(a);
@@ -153,7 +157,7 @@ typename Dist::value_type
       // If we're looking for a large result, then bump "adder" up
       // by a bit to increase our chances of bracketing the root:
       //
-      //adder = (std::max)(adder, 0.001f * guess);
+      //adder = BOOST_MATH_GPU_SAFE_MAX(adder, 0.001f * guess);
       if(fa < 0)
       {
          b = a + adder;
@@ -162,7 +166,7 @@ typename Dist::value_type
       }
       else
       {
-         b = (std::max)(value_type(a - adder), value_type(0));
+         b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
         if(b < min_bound)
            b = min_bound;
      }
@@ -186,7 +190,7 @@ typename Dist::value_type
          }
          else
          {
-            b = (std::max)(value_type(a - adder), value_type(0));
+            b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
            if(b < min_bound)
               b = min_bound;
         }
@@ -195,9 +199,8 @@ typename Dist::value_type
      }
      if(a > b)
      {
-        using std::swap;
-        swap(a, b);
-        swap(fa, fb);
+        BOOST_MATH_GPU_SAFE_SWAP(a, b);
+        BOOST_MATH_GPU_SAFE_SWAP(fa, fb);
      }
   }
@@ -274,7 +277,7 @@ typename Dist::value_type
   //
   // Go ahead and find the root:
   //
-  std::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
+  boost::math::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
   max_iter += count;
   if (max_iter >= policies::get_max_root_iterations<policy_type>())
  {
@@ -293,7 +296,7 @@ typename Dist::value_type
 // is very close 1.
 //
 template <class Dist>
-inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = ceil(result);
@@ -325,7 +328,7 @@ inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::va
 #endif

 template <class Dist>
-inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = floor(result);
@@ -339,7 +342,11 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
    //
    while(true)
    {
+      #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+      cc = ceil(nextafter(result, tools::max_value<typename Dist::value_type>()));
+      #else
       cc = ceil(float_next(result));
+      #endif
       if(cc > support(d).second)
         break;
      pp = c ? cdf(complement(d, cc)) : cdf(d, cc);
@@ -362,7 +369,7 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
 // to an int where required.
 //
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       typename Dist::value_type p,
@@ -371,7 +378,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::real>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    if(p > 0.5)
    {
@@ -393,7 +400,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -402,7 +409,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_outwards>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -436,7 +443,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -445,7 +452,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_inwards>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -479,7 +486,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -488,7 +495,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_down>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -507,7 +514,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -516,7 +523,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_up>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type pp = c ? 1 - p : p;
@@ -534,7 +541,7 @@ inline typename Dist::value_type
 }

 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -543,7 +550,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile<policies::integer_round_nearest>&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING

From ed0b3a088956c27c94d64d94b74835c09ff9d64b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 13:59:52 -0400
Subject: [PATCH 21/31] Add CUDA testing of geometric dist

---
 test/cuda_jamfile                        |   7 ++
 test/test_geometric_dist_cdf_double.cu  | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_cdf_float.cu   | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_pdf_double.cu  | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_pdf_float.cu   | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_quan_double.cu | 109 ++++++++++++++++++++++++
 test/test_geometric_dist_quan_float.cu  | 109 ++++++++++++++++++++++++
 7 files changed, 661 insertions(+)
 create mode 100644 test/test_geometric_dist_cdf_double.cu
 create mode 100644 test/test_geometric_dist_cdf_float.cu
 create mode 100644 test/test_geometric_dist_pdf_double.cu
 create mode 100644 test/test_geometric_dist_pdf_float.cu
 create mode 100644 test/test_geometric_dist_quan_double.cu
 create mode 100644 test/test_geometric_dist_quan_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index b01aa8bb1..57a16f2c7 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -79,6 +79,13 @@ run test_gamma_dist_pdf_float.cu ;
 run test_gamma_dist_quan_double.cu ;
 run test_gamma_dist_quan_float.cu ;

+run test_geometric_dist_cdf_double.cu ;
+run test_geometric_dist_cdf_float.cu ;
+run test_geometric_dist_pdf_double.cu ;
+run test_geometric_dist_pdf_float.cu ;
+run test_geometric_dist_quan_double.cu ;
+run test_geometric_dist_quan_float.cu ;
+
 run test_holtsmark_cdf_double.cu ;
 run test_holtsmark_cdf_float.cu ;
 run test_holtsmark_pdf_double.cu ;
diff --git a/test/test_geometric_dist_cdf_double.cu b/test/test_geometric_dist_cdf_double.cu
new file mode 100644
index 000000000..98b6510ad
--- /dev/null
+++ b/test/test_geometric_dist_cdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.  (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_cdf_float.cu b/test/test_geometric_dist_cdf_float.cu
new file mode 100644
index 000000000..2662ac07c
--- /dev/null
+++ b/test/test_geometric_dist_cdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_double.cu b/test/test_geometric_dist_pdf_double.cu
new file mode 100644
index 000000000..03d2dc007
--- /dev/null
+++ b/test/test_geometric_dist_pdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_float.cu b/test/test_geometric_dist_pdf_float.cu
new file mode 100644
index 000000000..1034d122b
--- /dev/null
+++ b/test/test_geometric_dist_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_double.cu b/test/test_geometric_dist_quan_double.cu
new file mode 100644
index 000000000..fcac938e5
--- /dev/null
+++ b/test/test_geometric_dist_quan_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_float.cu b/test/test_geometric_dist_quan_float.cu
new file mode 100644
index 000000000..89d8bea47
--- /dev/null
+++ b/test/test_geometric_dist_quan_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file From bf31592c135856a4b8a6e60d93fd5ef6c68415e6 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:16:01 -0400 Subject: [PATCH 22/31] Add NVRTC testing of geometric dist --- .../boost/math/distributions/geometric.hpp | 2 +- test/nvrtc_jamfile | 7 + test/test_geometric_dist_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++ .../test_geometric_dist_quan_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_geometric_dist_quan_nvrtc_float.cpp | 191 ++++++++++++++++++ 8 files changed, 1154 insertions(+), 1 deletion(-) create mode 100644 test/test_geometric_dist_cdf_nvrtc_double.cpp create mode 100644 test/test_geometric_dist_cdf_nvrtc_float.cpp create mode 100644 test/test_geometric_dist_pdf_nvrtc_double.cpp create mode 100644 test/test_geometric_dist_pdf_nvrtc_float.cpp create mode 100644 test/test_geometric_dist_quan_nvrtc_double.cpp create mode 100644 test/test_geometric_dist_quan_nvrtc_float.cpp diff --git a/include/boost/math/distributions/geometric.hpp b/include/boost/math/distributions/geometric.hpp index 8aa78ddc9..0a7b383c2 100644 --- a/include/boost/math/distributions/geometric.hpp +++ b/include/boost/math/distributions/geometric.hpp @@ -212,7 +212,7 @@ namespace boost return result + k; } // RealType find_number_of_failures - static RealType find_maximum_number_of_trials( + BOOST_MATH_GPU_ENABLED static RealType find_maximum_number_of_trials( RealType k, // number of failures (k >= 0). RealType p, // success fraction 0 <= p <= 1. RealType alpha) // risk level threshold 0 <= alpha <= 1. diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 94fc6cc9b..cf3fe89e9 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -73,6 +73,13 @@ run test_gamma_dist_pdf_nvrtc_float.cpp ; run test_gamma_dist_quan_nvrtc_double.cpp ; run test_gamma_dist_quan_nvrtc_float.cpp ; +run test_geometric_dist_cdf_nvrtc_double.cpp ; +run test_geometric_dist_cdf_nvrtc_float.cpp ; +run test_geometric_dist_pdf_nvrtc_double.cpp ; +run test_geometric_dist_pdf_nvrtc_float.cpp ; +run test_geometric_dist_quan_nvrtc_double.cpp ; +run test_geometric_dist_quan_nvrtc_float.cpp ; + run test_holtsmark_cdf_nvrtc_double.cpp ; run test_holtsmark_cdf_nvrtc_float.cpp ; run test_holtsmark_pdf_nvrtc_double.cpp ; diff --git a/test/test_geometric_dist_cdf_nvrtc_double.cpp b/test/test_geometric_dist_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..f8c5ed5aa --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_cdf_nvrtc_float.cpp b/test/test_geometric_dist_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..a53cd0d97 --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_pdf_nvrtc_double.cpp b/test/test_geometric_dist_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..8a6b5756e --- /dev/null +++ b/test/test_geometric_dist_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_pdf_nvrtc_float.cpp b/test/test_geometric_dist_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..dfb05105d --- /dev/null +++ b/test/test_geometric_dist_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_quan_nvrtc_double.cpp b/test/test_geometric_dist_quan_nvrtc_double.cpp new file mode 100644 index 000000000..52b2e97ec --- /dev/null +++ b/test/test_geometric_dist_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_quan_nvrtc_float.cpp b/test/test_geometric_dist_quan_nvrtc_float.cpp new file mode 100644 index 000000000..a83cf857e --- /dev/null +++ b/test/test_geometric_dist_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From ca4bb46d25e7376614721606bdfcdc843f6c442b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:29:32 -0400 Subject: [PATCH 23/31] Add SYCL testing of inverse_chi_squared dist --- .../distributions/inverse_chi_squared.hpp | 68 +++++++++---------- test/sycl_jamfile | 1 + .../test_inverse_chi_squared_distribution.cpp | 7 +- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/include/boost/math/distributions/inverse_chi_squared.hpp b/include/boost/math/distributions/inverse_chi_squared.hpp index 19dd0371e..1a3c680d2 100644 --- a/include/boost/math/distributions/inverse_chi_squared.hpp +++ b/include/boost/math/distributions/inverse_chi_squared.hpp @@ -1,6 +1,6 @@ // Copyright John Maddock 2010. // Copyright Paul A. Bristow 2010. - +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -9,6 +9,8 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP #define BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP +#include +#include #include #include // for incomplete beta. #include // for complements. @@ -24,14 +26,12 @@ // Weisstein, Eric W. "Inverse Chi-Squared Distribution." From MathWorld--A Wolfram Web Resource. // http://mathworld.wolfram.com/InverseChi-SquaredDistribution.html -#include - namespace boost{ namespace math{ namespace detail { template - inline bool check_inverse_chi_squared( // Check both distribution parameters. 
+  BOOST_MATH_GPU_ENABLED inline bool check_inverse_chi_squared( // Check both distribution parameters.
       const char* function,
       RealType degrees_of_freedom, // degrees_of_freedom (aka nu).
       RealType scale, // scale (aka sigma^2)
@@ -51,7 +51,7 @@ class inverse_chi_squared_distribution
    typedef RealType value_type;
    typedef Policy policy_type;
 
-   inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale)
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale)
    {
       RealType result;
       detail::check_df(
@@ -62,7 +62,7 @@ class inverse_chi_squared_distribution
          m_scale, &result, Policy());
    } // inverse_chi_squared_distribution constructor
 
-   inverse_chi_squared_distribution(RealType df = 1) : m_df(df)
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1) : m_df(df)
    {
       RealType result;
       m_scale = 1 / m_df ; // Default scale = 1 / degrees of freedom (Wikipedia definition 1).
@@ -71,11 +71,11 @@ class inverse_chi_squared_distribution
         m_df, &result, Policy());
    } // inverse_chi_squared_distribution
 
-   RealType degrees_of_freedom()const
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const
    {
      return m_df; // aka nu
    }
-   RealType scale()const
+   BOOST_MATH_GPU_ENABLED RealType scale()const
    {
      return m_scale; // aka xi
    }
@@ -105,28 +105,28 @@ inverse_chi_squared_distribution(RealType,RealType)->inverse_chi_squared_distrib
 #endif
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> range(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
 { // Range of permissible values for random variable x.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // 0 to + infinity.
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // 0 to + infinity.
 }
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> support(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const inverse_chi_squared_distribution<RealType, Policy>& /*dist*/)
 { // Range of supported values for random variable x.
   // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), tools::max_value<RealType>()); // 0 to + infinity.
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), tools::max_value<RealType>()); // 0 to + infinity.
 }
 
 template <class RealType, class Policy>
-RealType pdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED RealType pdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING  // for ADL of std functions.
   RealType df = dist.degrees_of_freedom();
   RealType scale = dist.scale();
   RealType error_result;

-  static const char* function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
+  constexpr auto function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)";

   if(false == detail::check_inverse_chi_squared
     (function, df, scale, &error_result, Policy())
@@ -159,9 +159,9 @@ RealType pdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, con
 } // pdf

 template <class RealType, class Policy>
-inline RealType cdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& x)
 {
-   static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
    RealType df = dist.degrees_of_freedom();
    RealType scale = dist.scale();
    RealType error_result;
@@ -188,13 +188,13 @@ inline RealType cdf(const inverse_chi_squared_distribution<RealType, Policy>& di
 } // cdf

 template <class RealType, class Policy>
-inline RealType quantile(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_chi_squared_distribution<RealType, Policy>& dist, const RealType& p)
 {
    using boost::math::gamma_q_inv;
    RealType df = dist.degrees_of_freedom();
    RealType scale = dist.scale();

-   static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
    // Error check:
    RealType error_result;
    if(false == detail::check_df(
@@ -220,13 +220,13 @@ inline RealType quantile(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
 {
    using boost::math::gamma_q_inv;
    RealType const& df = c.dist.degrees_of_freedom();
    RealType const& scale = c.dist.scale();
    RealType const& x = c.param;
-   static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)";
    // Error check:
    RealType error_result;
    if(false == detail::check_df(
@@ -251,14 +251,14 @@ inline RealType cdf(const complemented2_type
 }

 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<inverse_chi_squared_distribution<RealType, Policy>, RealType>& c)
 {
    using boost::math::gamma_q_inv;
    RealType const& df = c.dist.degrees_of_freedom();
    RealType const& scale = c.dist.scale();
    RealType const& q = c.param;

-   static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)";
    // Error check:
    RealType error_result;
    if(false == detail::check_df(function, df, &error_result, Policy()))
@@ -280,12 +280,12 @@ inline RealType quantile(const complemented2_type
 }

 template <class RealType, class Policy>
-inline RealType mean(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 { // Mean of inverse Chi-Squared distribution.
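   // Reference: for this parameterisation the closed form is
   //    mean = nu * xi / (nu - 2)   for nu > 2,
   // which is why the domain check below rejects df <= 2.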
   RealType df = dist.degrees_of_freedom();
   RealType scale = dist.scale();
-   static const char* function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)";
   if(df <= 2)
      return policies::raise_domain_error<RealType>(
         function,
@@ -295,11 +295,11 @@ inline RealType mean(const inverse_chi_squared_distribution<RealType, Policy>& d
 } // mean

 template <class RealType, class Policy>
-inline RealType variance(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 { // Variance of inverse Chi-Squared distribution.
    RealType df = dist.degrees_of_freedom();
    RealType scale = dist.scale();
-   static const char* function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 4)
    {
       return policies::raise_domain_error<RealType>(
@@ -311,14 +311,14 @@ inline RealType variance(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType mode(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 { // mode is not defined in Mathematica.
   // See Discussion section http://en.wikipedia.org/wiki/Talk:Scaled-inverse-chi-square_distribution
   // for origin of the formula used below.

   RealType df = dist.degrees_of_freedom();
   RealType scale = dist.scale();
-   static const char* function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)";
   if(df < 0)
      return policies::raise_domain_error<RealType>(
         function,
@@ -341,11 +341,11 @@ inline RealType mode(const inverse_chi_squared_distribution<RealType, Policy>& d
 // Now implemented via quantile(half) in derived accessors.
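// Reference for the shape functions that follow, in the same parameterisation
// (nu = degrees of freedom); the df guards match these validity ranges:
//
//    skewness        = 4 / (nu - 6) * sqrt(2 * (nu - 4))            for nu > 6
//    kurtosis_excess = 12 * (5 * nu - 22) / ((nu - 6) * (nu - 8))   for nu > 8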
template <class RealType, class Policy>
-inline RealType skewness(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // For ADL
    RealType df = dist.degrees_of_freedom();
-   static const char* function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 6)
       return policies::raise_domain_error<RealType>(
          function,
@@ -356,10 +356,10 @@ inline RealType skewness(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType kurtosis(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 {
    RealType df = dist.degrees_of_freedom();
-   static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 8)
       return policies::raise_domain_error<RealType>(
          function,
@@ -370,10 +370,10 @@ inline RealType kurtosis(const inverse_chi_squared_distribution
 }

 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const inverse_chi_squared_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_chi_squared_distribution<RealType, Policy>& dist)
 {
    RealType df = dist.degrees_of_freedom();
-   static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis_excess(const inverse_chi_squared_distribution<%1%>&)";
    if(df <= 8)
       return policies::raise_domain_error<RealType>(
          function,
diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index c9527a8c3..e90dc0e70 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -21,6 +21,7 @@ run test_fisher_f.cpp ;
 run test_gamma_dist.cpp ;
 run test_geometric.cpp ;
 run test_holtsmark.cpp ;
+run test_inverse_chi_squared_distribution.cpp ;
 run test_landau.cpp ;
 run test_laplace.cpp ;
 run test_logistic_dist.cpp ;
diff --git a/test/test_inverse_chi_squared_distribution.cpp b/test/test_inverse_chi_squared_distribution.cpp
index a69782418..cbc9dcf19 100644
--- a/test/test_inverse_chi_squared_distribution.cpp
+++ b/test/test_inverse_chi_squared_distribution.cpp
@@ -14,11 +14,14 @@

 // http://www.wolframalpha.com/input/?i=inverse+chisquare+distribution

-#include
+#include
+#include "../include_private/boost/math/tools/test.hpp"
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include // for real_concept
 using ::boost::math::concepts::real_concept;
+#endif

-//#include
 #define BOOST_TEST_MAIN
 #include // for test_main
 #include // for BOOST_CHECK_CLOSE_FRACTION
From 5a7e304409fb87dc907360e03c011fe35cd70784 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 14:36:44 -0400
Subject: [PATCH 24/31] Adjust tol

---
 test/test_geometric_dist_quan_float.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_geometric_dist_quan_float.cu b/test/test_geometric_dist_quan_float.cu
index 89d8bea47..074952202 100644
--- a/test/test_geometric_dist_quan_float.cu
+++ b/test/test_geometric_dist_quan_float.cu
@@ -90,7 +90,7 @@ int main(void)
     // check the results
     for(int i = 0; i < numElements; ++i)
     {
-        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200.0)
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000.0)
         {
             std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; From 8dd1e81f0e0ccb6e373fdd15e2f4749b874e34f4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:44:29 -0400 Subject: [PATCH 25/31] Add NVRTC inverse chi squared dist testing --- test/nvrtc_jamfile | 7 + ...t_inverse_chi_squared_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++ ...st_inverse_chi_squared_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++ ...t_inverse_chi_squared_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++ ...st_inverse_chi_squared_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++ ..._inverse_chi_squared_quan_nvrtc_double.cpp | 191 ++++++++++++++++++ ...t_inverse_chi_squared_quan_nvrtc_float.cpp | 191 ++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_double.cpp create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_float.cpp create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_double.cpp create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_float.cpp create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_double.cpp create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index cf3fe89e9..0834086d4 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -87,6 +87,13 @@ run test_holtsmark_pdf_nvrtc_float.cpp ; run test_holtsmark_quan_nvrtc_double.cpp ; run test_holtsmark_quan_nvrtc_float.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_quan_nvrtc_double.cpp ; +run test_inverse_chi_squared_quan_nvrtc_float.cpp ; + run test_landau_cdf_nvrtc_double.cpp ; run test_landau_cdf_nvrtc_float.cpp ; run test_landau_pdf_nvrtc_double.cpp ; diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..b221aedaa --- /dev/null +++ b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..743654c14 --- /dev/null +++ b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..4608b3bd6 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..8b4db55c0 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_double.cpp b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp new file mode 100644 index 000000000..0f8a9a5f8 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i 
< numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_float.cpp b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp new file mode 100644 index 000000000..ab494a8da --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From bab0138bfcb08b9513d9c58eced6134a2268b802 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 14:53:29 -0400 Subject: [PATCH 26/31] Add CUDA inverse chi squared dist testing --- test/cuda_jamfile | 7 ++ test/test_inverse_chi_squared_cdf_double.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_cdf_float.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_pdf_double.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_pdf_float.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_quan_double.cu | 110 +++++++++++++++++++ test/test_inverse_chi_squared_quan_float.cu | 110 +++++++++++++++++++ 7 files changed, 667 insertions(+) create mode 100644 test/test_inverse_chi_squared_cdf_double.cu create mode 100644 test/test_inverse_chi_squared_cdf_float.cu create mode 100644 test/test_inverse_chi_squared_pdf_double.cu create mode 100644 test/test_inverse_chi_squared_pdf_float.cu create mode 100644 test/test_inverse_chi_squared_quan_double.cu create mode 100644 test/test_inverse_chi_squared_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 57a16f2c7..283267593 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -91,6 +91,13 @@ run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; run test_holtsmark_pdf_float.cu ; +run test_inverse_chi_squared_cdf_double.cu ; +run test_inverse_chi_squared_cdf_float.cu ; +run test_inverse_chi_squared_pdf_double.cu ; +run test_inverse_chi_squared_pdf_float.cu ; +run test_inverse_chi_squared_quan_double.cu ; +run test_inverse_chi_squared_quan_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; diff --git a/test/test_inverse_chi_squared_cdf_double.cu b/test/test_inverse_chi_squared_cdf_double.cu new file mode 100644 index 000000000..9703e7a3a --- /dev/null +++ b/test/test_inverse_chi_squared_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_cdf_float.cu b/test/test_inverse_chi_squared_cdf_float.cu
new file mode 100644
index 000000000..bb56a4872
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_pdf_double.cu b/test/test_inverse_chi_squared_pdf_double.cu
new file mode 100644
index 000000000..f30611749
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+       results.push_back(pdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_pdf_float.cu b/test/test_inverse_chi_squared_pdf_float.cu
new file mode 100644
index 000000000..8a3d1c1ef
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_quan_double.cu b/test/test_inverse_chi_squared_quan_double.cu
new file mode 100644
index 000000000..f9022c6a3
--- /dev/null
+++ b/test/test_inverse_chi_squared_quan_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_quan_float.cu b/test/test_inverse_chi_squared_quan_float.cu
new file mode 100644
index 000000000..10aa6d707
--- /dev/null
+++ b/test/test_inverse_chi_squared_quan_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} From 62ac8cda1cab598bfd17a9a225d9ab4837d44a17 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:29:51 -0400 Subject: [PATCH 27/31] Add GPU support to inverse gamma dist --- .../math/distributions/inverse_gamma.hpp | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/include/boost/math/distributions/inverse_gamma.hpp b/include/boost/math/distributions/inverse_gamma.hpp index 8c9e4763d..6aa798ed8 100644 --- a/include/boost/math/distributions/inverse_gamma.hpp +++ b/include/boost/math/distributions/inverse_gamma.hpp @@ -2,6 +2,7 @@ // Copyright Paul A. Bristow 2010. // Copyright John Maddock 2010. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -22,21 +23,21 @@ // http://mathworld.wolfram.com/GammaDistribution.html // http://en.wikipedia.org/wiki/Gamma_distribution +#include +#include +#include #include #include #include #include -#include -#include - namespace boost{ namespace math { namespace detail { template -inline bool check_inverse_gamma_shape( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_shape( const char* function, // inverse_gamma RealType shape, // shape aka alpha RealType* result, // to update, perhaps with NaN @@ -57,7 +58,7 @@ inline bool check_inverse_gamma_shape( } //bool check_inverse_gamma_shape template -inline bool check_inverse_gamma_x( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -73,7 +74,7 @@ inline bool check_inverse_gamma_x( } template -inline bool check_inverse_gamma( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma( const char* function, // TODO swap these over, so shape is first. RealType scale, // scale aka beta RealType shape, // shape aka alpha @@ -92,7 +93,7 @@ class inverse_gamma_distribution using value_type = RealType; using policy_type = Policy; - explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) + BOOST_MATH_GPU_ENABLED explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; @@ -101,12 +102,12 @@ class inverse_gamma_distribution l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_MATH_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_MATH_GPU_ENABLED RealType scale()const { return m_scale; } @@ -132,27 +133,27 @@ inverse_gamma_distribution(RealType,RealType)->inverse_gamma_distribution -inline std::pair range(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair range(const inverse_gamma_distribution& /* dist */) { // Range of permissible values for random variable x. 
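+   // boost::math::pair stands in for std::pair here and in support() below:
+   // device code cannot in general call into the host standard library, so
+   // this patch series routes such utilities through GPU-safe equivalents.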
using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline std::pair support(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair support(const inverse_gamma_distribution& /* dist */) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. using boost::math::tools::max_value; using boost::math::tools::min_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -195,17 +196,17 @@ inline RealType pdf(const inverse_gamma_distribution& dist, co } // pdf template -inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::lgamma; - static const char* function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); - RealType result = -std::numeric_limits::infinity(); + RealType result = -boost::math::numeric_limits::infinity(); if(false == detail::check_inverse_gamma(function, scale, shape, &result, Policy())) { // distribution parameters bad. 
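+      // Depending on the user's Policy we reach this point with result set
+      // to the policy's error value (the default policy throws instead).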
return result; @@ -232,11 +233,11 @@ inline RealType logpdf(const inverse_gamma_distribution& dist, } // pdf template -inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -260,12 +261,12 @@ inline RealType cdf(const inverse_gamma_distribution& dist, co } // cdf template -inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::gamma_q_inv; - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -287,11 +288,11 @@ inline RealType quantile(const inverse_gamma_distribution& dis } template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -310,11 +311,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -338,11 +339,11 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -365,11 +366,11 @@ inline RealType mean(const inverse_gamma_distribution& dist) } // mean template -inline RealType variance(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -391,11 +392,11 @@ inline RealType variance(const inverse_gamma_distribution& dis } template -inline RealType mode(const inverse_gamma_distribution& 
dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -418,11 +419,11 @@ inline RealType mode(const inverse_gamma_distribution& dist) //} template -inline RealType skewness(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -444,11 +445,11 @@ inline RealType skewness(const inverse_gamma_distribution& dis } template -inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -470,9 +471,9 @@ inline RealType kurtosis_excess(const inverse_gamma_distribution -inline RealType kurtosis(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_gamma_distribution& dist) { - static const char* function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); From 2f05b01088cfe7f18be9874c59b1f9c61d4ef5b3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:30:05 -0400 Subject: [PATCH 28/31] Add SYCL testing to inverse gamma dist --- test/sycl_jamfile | 1 + test/test_inverse_gamma_distribution.cpp | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/sycl_jamfile b/test/sycl_jamfile index e90dc0e70..bcdfb7e5d 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -22,6 +22,7 @@ run test_gamma_dist.cpp ; run test_geometric.cpp ; run test_holtsmark.cpp ; run test_inverse_chi_squared_distribution.cpp ; +run test_inverse_gamma_distribution.cpp ; run test_landau.cpp ; run test_laplace.cpp ; run test_logistic_dist.cpp ; diff --git a/test/test_inverse_gamma_distribution.cpp b/test/test_inverse_gamma_distribution.cpp index 68b238fbc..436131d83 100644 --- a/test/test_inverse_gamma_distribution.cpp +++ b/test/test_inverse_gamma_distribution.cpp @@ -14,11 +14,14 @@ # pragma warning (disable : 4310) // cast truncates constant value #endif -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT #include // for real_concept using ::boost::math::concepts::real_concept; +#endif -//#include #define BOOST_TEST_MAIN #include // for test_main #include // for BOOST_CHECK_CLOSE_FRACTION From c95d73ceaac5ad6838ab68ce9b0e02b67fc0a07f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:43:00 -0400 Subject: [PATCH 29/31] Add NVRTC testing of inverse gamma dist --- 
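Note: each of the six tests added below embeds its kernel as an NVRTC source
string and JIT-compiles it at runtime. As orientation, a minimal sketch of the
kernel they compile is shown here (the cdf variant; the pdf and quantile tests
swap in the matching call). The unnamed second in-pointer is unused by the
kernel but matches the four-argument launch used by the host code, and the
default-constructed distribution has shape = scale = 1.

    typedef double float_type;
    #include <boost/math/distributions/inverse_gamma.hpp>

    extern "C" __global__
    void test_inverse_gamma_kernel(const float_type* in1, const float_type*, float_type* out, int numElements)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements)
        {
            // Default-constructed distribution: shape = 1, scale = 1
            out[i] = cdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
        }
    }
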
test/nvrtc_jamfile | 7 + test/test_inverse_gamma_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_quan_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_inverse_gamma_quan_nvrtc_float.cpp | 191 ++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_inverse_gamma_cdf_nvrtc_double.cpp create mode 100644 test/test_inverse_gamma_cdf_nvrtc_float.cpp create mode 100644 test/test_inverse_gamma_pdf_nvrtc_double.cpp create mode 100644 test/test_inverse_gamma_pdf_nvrtc_float.cpp create mode 100644 test/test_inverse_gamma_quan_nvrtc_double.cpp create mode 100644 test/test_inverse_gamma_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 0834086d4..1b001eb2f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -94,6 +94,13 @@ run test_inverse_chi_squared_pdf_nvrtc_float.cpp ; run test_inverse_chi_squared_quan_nvrtc_double.cpp ; run test_inverse_chi_squared_quan_nvrtc_float.cpp ; +run test_inverse_gamma_cdf_nvrtc_double.cpp ; +run test_inverse_gamma_cdf_nvrtc_float.cpp ; +run test_inverse_gamma_pdf_nvrtc_double.cpp ; +run test_inverse_gamma_pdf_nvrtc_float.cpp ; +run test_inverse_gamma_quan_nvrtc_double.cpp ; +run test_inverse_gamma_quan_nvrtc_float.cpp ; + run test_landau_cdf_nvrtc_double.cpp ; run test_landau_cdf_nvrtc_float.cpp ; run test_landau_pdf_nvrtc_double.cpp ; diff --git a/test/test_inverse_gamma_cdf_nvrtc_double.cpp b/test/test_inverse_gamma_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..c5a4b9878 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] 
= static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_cdf_nvrtc_float.cpp b/test/test_inverse_gamma_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..d76d51225 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_pdf_nvrtc_double.cpp b/test/test_inverse_gamma_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..db2c8c4e1 --- /dev/null +++ b/test/test_inverse_gamma_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] 
= static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_pdf_nvrtc_float.cpp b/test/test_inverse_gamma_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..4d552cf61 --- /dev/null +++ b/test/test_inverse_gamma_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_quan_nvrtc_double.cpp b/test/test_inverse_gamma_quan_nvrtc_double.cpp new file mode 100644 index 000000000..a49600bde --- /dev/null +++ b/test/test_inverse_gamma_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + 
h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_quan_nvrtc_float.cpp b/test/test_inverse_gamma_quan_nvrtc_float.cpp new file mode 100644 index 000000000..f71ed964a --- /dev/null +++ b/test/test_inverse_gamma_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + 
h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 44f82e1a39c029b30d4e742d7df89e3de68d05a9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:52:54 -0400 Subject: [PATCH 30/31] Add CUDA testing of inverse gamma dist --- test/cuda_jamfile | 7 ++ test/test_inverse_gamma_cdf_double.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_cdf_float.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_pdf_double.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_pdf_float.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_quan_double.cu | 110 +++++++++++++++++++++++++ test/test_inverse_gamma_quan_float.cu | 110 +++++++++++++++++++++++++ 7 files changed, 667 insertions(+) create mode 100644 test/test_inverse_gamma_cdf_double.cu create mode 100644 test/test_inverse_gamma_cdf_float.cu create mode 100644 test/test_inverse_gamma_pdf_double.cu create mode 100644 test/test_inverse_gamma_pdf_float.cu create mode 100644 test/test_inverse_gamma_quan_double.cu create mode 100644 test/test_inverse_gamma_quan_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 283267593..796d14a49 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -98,6 +98,13 @@ run test_inverse_chi_squared_pdf_float.cu ; run test_inverse_chi_squared_quan_double.cu ; run test_inverse_chi_squared_quan_float.cu ; +run test_inverse_gamma_cdf_double.cu ; +run test_inverse_gamma_cdf_float.cu ; +run test_inverse_gamma_pdf_double.cu ; +run test_inverse_gamma_pdf_float.cu ; +run test_inverse_gamma_quan_double.cu ; +run test_inverse_gamma_quan_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; diff --git a/test/test_inverse_gamma_cdf_double.cu b/test/test_inverse_gamma_cdf_double.cu new file mode 100644 index 000000000..4368a2284 --- /dev/null +++ b/test/test_inverse_gamma_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_cdf_float.cu b/test/test_inverse_gamma_cdf_float.cu
new file mode 100644
index 000000000..cef2ec955
--- /dev/null
+++ b/test/test_inverse_gamma_cdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_double.cu b/test/test_inverse_gamma_pdf_double.cu
new file mode 100644
index 000000000..fa5073dbe
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_float.cu b/test/test_inverse_gamma_pdf_float.cu
new file mode 100644
index 000000000..c2d80fe8d
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_quan_double.cu b/test/test_inverse_gamma_quan_double.cu
new file mode 100644
index 000000000..c9095d752
--- /dev/null
+++ b/test/test_inverse_gamma_quan_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_gamma_quan_float.cu b/test/test_inverse_gamma_quan_float.cu
new file mode 100644
index 000000000..3e60feaa1
--- /dev/null
+++ b/test/test_inverse_gamma_quan_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+//  (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} From 89e1707f4ddf86b6b9c5fa8e214a6fa5a2d542bf Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Tue, 3 Sep 2024 15:57:34 -0400 Subject: [PATCH 31/31] Update docs --- doc/distributions/fisher.qbk | 14 ++++++++------ doc/distributions/gamma.qbk | 14 ++++++++------ doc/distributions/geometric.qbk | 16 +++++++++------- doc/distributions/inverse_chi_squared.qbk | 18 ++++++++++-------- doc/distributions/inverse_gamma.qbk | 14 ++++++++------ 5 files changed, 43 insertions(+), 33 deletions(-) diff --git a/doc/distributions/fisher.qbk b/doc/distributions/fisher.qbk index 80c9a9b29..9b3a55f59 100644 --- a/doc/distributions/fisher.qbk +++ b/doc/distributions/fisher.qbk @@ -17,11 +17,11 @@ typedef RealType value_type; // Construct: - fisher_f_distribution(const RealType& i, const RealType& j); + BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& i, const RealType& j); // Accessors: - RealType degrees_of_freedom1()const; - RealType degrees_of_freedom2()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const; }; }} //namespaces @@ -46,7 +46,7 @@ two degrees of freedom parameters. [h4 Member Functions] - fisher_f_distribution(const RealType& df1, const RealType& df2); + BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& df1, const RealType& df2); Constructs an F-distribution with numerator degrees of freedom /df1/ and denominator degrees of freedom /df2/. @@ -54,11 +54,11 @@ and denominator degrees of freedom /df2/. Requires that /df1/ and /df2/ are both greater than zero, otherwise __domain_error is called. - RealType degrees_of_freedom1()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const; Returns the numerator degrees of freedom parameter of the distribution. - RealType degrees_of_freedom2()const; + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const; Returns the denominator degrees of freedom parameter of the distribution. @@ -66,6 +66,8 @@ Returns the denominator degrees of freedom parameter of the distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0, +[infin]\]. diff --git a/doc/distributions/gamma.qbk b/doc/distributions/gamma.qbk index eefcc84a0..5f9f0c2bf 100644 --- a/doc/distributions/gamma.qbk +++ b/doc/distributions/gamma.qbk @@ -12,10 +12,10 @@ typedef RealType value_type; typedef Policy policy_type; - gamma_distribution(RealType shape, RealType scale = 1) + BOOST_MATH_GPU_ENABLED gamma_distribution(RealType shape, RealType scale = 1) - RealType shape()const; - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; }; }} // namespaces @@ -76,7 +76,7 @@ a dedicated Erlang Distribution. 
[h4 Member Functions] - gamma_distribution(RealType shape, RealType scale = 1); + BOOST_MATH_GPU_ENABLED gamma_distribution(RealType shape, RealType scale = 1); Constructs a gamma distribution with shape /shape/ and scale /scale/. @@ -84,11 +84,11 @@ scale /scale/. Requires that the shape and scale parameters are greater than zero, otherwise calls __domain_error. - RealType shape()const; + BOOST_MATH_GPU_ENABLED RealType shape()const; Returns the /shape/ parameter of this distribution. - RealType scale()const; + BOOST_MATH_GPU_ENABLED RealType scale()const; Returns the /scale/ parameter of this distribution. @@ -96,6 +96,8 @@ Returns the /scale/ parameter of this distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variable is \[0,+[infin]\]. diff --git a/doc/distributions/geometric.qbk b/doc/distributions/geometric.qbk index 7aa1a3343..038753d95 100644 --- a/doc/distributions/geometric.qbk +++ b/doc/distributions/geometric.qbk @@ -17,28 +17,28 @@ typedef RealType value_type; typedef Policy policy_type; // Constructor from success_fraction: - geometric_distribution(RealType p); + BOOST_MATH_GPU_ENABLED geometric_distribution(RealType p); // Parameter accessors: - RealType success_fraction() const; - RealType successes() const; + BOOST_MATH_GPU_ENABLED RealType success_fraction() const; + BOOST_MATH_GPU_ENABLED RealType successes() const; // Bounds on success fraction: - static RealType find_lower_bound_on_p( + BOOST_MATH_GPU_ENABLED static RealType find_lower_bound_on_p( RealType trials, RealType successes, RealType probability); // alpha - static RealType find_upper_bound_on_p( + BOOST_MATH_GPU_ENABLED static RealType find_upper_bound_on_p( RealType trials, RealType successes, RealType probability); // alpha // Estimate min/max number of trials: - static RealType find_minimum_number_of_trials( + BOOST_MATH_GPU_ENABLED static RealType find_minimum_number_of_trials( RealType k, // Number of failures. RealType p, // Success fraction. RealType probability); // Probability threshold alpha. - static RealType find_maximum_number_of_trials( + BOOST_MATH_GPU_ENABLED static RealType find_maximum_number_of_trials( RealType k, // Number of failures. RealType p, // Success fraction. RealType probability); // Probability threshold alpha. @@ -268,6 +268,8 @@ of observing more than k failures. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. However it's worth taking a moment to define what these actually mean in the context of this distribution: diff --git a/doc/distributions/inverse_chi_squared.qbk b/doc/distributions/inverse_chi_squared.qbk index 7bc75a881..8d67082d0 100644 --- a/doc/distributions/inverse_chi_squared.qbk +++ b/doc/distributions/inverse_chi_squared.qbk @@ -12,11 +12,11 @@ typedef RealType value_type; typedef Policy policy_type; - inverse_chi_squared_distribution(RealType df = 1); // Not explicitly scaled, default 1/df. - inverse_chi_squared_distribution(RealType df, RealType scale = 1/df); // Scaled. 
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1); // Not explicitly scaled, default 1/df.
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType scale = 1/df); // Scaled.
 
-   RealType degrees_of_freedom()const; // Default 1.
-   RealType scale()const; // Optional scale [xi] (variance), default 1/degrees_of_freedom.
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const; // Default 1.
+   BOOST_MATH_GPU_ENABLED RealType scale()const; // Optional scale [xi] (variance), default 1/degrees_of_freedom.
 };
 
 }} // namespace boost // namespace math
 
@@ -99,8 +99,8 @@ varies for a few values of parameters [nu] and [xi]:
 
 [h4 Member Functions]
 
-   inverse_chi_squared_distribution(RealType df = 1); // Implicitly scaled 1/df.
-   inverse_chi_squared_distribution(RealType df = 1, RealType scale); // Explicitly scaled.
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1); // Implicitly scaled 1/df.
+   BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType scale = 1/df); // Explicitly scaled.
 
 Constructs an inverse chi_squared distribution with [nu] degrees of freedom ['df],
 and scale ['scale] with default value 1\/df.
 
 Requires that the degrees of freedom [nu] parameter is greater than zero, otherwise calls __domain_error.
 
-   RealType degrees_of_freedom()const;
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const;
 
 Returns the degrees_of_freedom [nu] parameter of this distribution.
 
-   RealType scale()const;
+   BOOST_MATH_GPU_ENABLED RealType scale()const;
 
 Returns the scale [xi] parameter of this distribution.
 
@@ -120,6 +120,8 @@ Returns the scale [xi] parameter of this distribution.
 
 All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions]
 that are generic to all distributions are supported: __usual_accessors.
+For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can
+be run on both host and device.
 
 The domain of the random variate is \[0,+[infin]\].
 
 [note Unlike some definitions, this implementation supports a random variate
diff --git a/doc/distributions/inverse_gamma.qbk b/doc/distributions/inverse_gamma.qbk
index 8fccbc19c..ee68651df 100644
--- a/doc/distributions/inverse_gamma.qbk
+++ b/doc/distributions/inverse_gamma.qbk
@@ -12,10 +12,10 @@
 
     typedef RealType value_type;
     typedef Policy policy_type;
 
-   inverse_gamma_distribution(RealType shape, RealType scale = 1)
+   BOOST_MATH_GPU_ENABLED inverse_gamma_distribution(RealType shape, RealType scale = 1)
 
-   RealType shape()const;
-   RealType scale()const;
+   BOOST_MATH_GPU_ENABLED RealType shape()const;
+   BOOST_MATH_GPU_ENABLED RealType scale()const;
 };
 
 }} // namespaces
 
@@ -63,18 +63,18 @@ varies as the parameters vary:
 
 [h4 Member Functions]
 
-   inverse_gamma_distribution(RealType shape = 1, RealType scale = 1);
+   BOOST_MATH_GPU_ENABLED inverse_gamma_distribution(RealType shape = 1, RealType scale = 1);
 
 Constructs an inverse gamma distribution with shape [alpha] and scale [beta].
 
 Requires that the shape and scale parameters are greater than zero, otherwise calls
 __domain_error.
 
-   RealType shape()const;
+   BOOST_MATH_GPU_ENABLED RealType shape()const;
 
 Returns the [alpha] shape parameter of this inverse gamma distribution.
 
-   RealType scale()const;
+   BOOST_MATH_GPU_ENABLED RealType scale()const;
 
 Returns the [beta] scale parameter of this inverse gamma distribution.
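+
+As a minimal device-side sketch (the kernel name and launch configuration here
+are illustrative only, not part of the documented API), the constructor and
+both accessors can be called directly from a CUDA kernel:
+
+    __global__ void inverse_gamma_example(const double* in, double* out, int n)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < n)
+        {
+            boost::math::inverse_gamma_distribution<double> dist(2.0, 3.0); // shape() == 2, scale() == 3
+            out[i] = cdf(dist, in[i]);
+        }
+    }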
@@ -82,6 +82,8 @@ Returns the [beta] scale parameter of this inverse gamma distribution. All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] that are generic to all distributions are supported: __usual_accessors. +For this distribution all non-member accessor functions are marked with `BOOST_MATH_GPU_ENABLED` and can +be run on both host and device. The domain of the random variate is \[0,+[infin]\]. [note Unlike some definitions, this implementation supports a random variate