diff --git a/include/boost/math/fft.hpp b/include/boost/math/fft.hpp
index f7c695e573..62d815d53d 100644
--- a/include/boost/math/fft.hpp
+++ b/include/boost/math/fft.hpp
@@ -9,10 +9,10 @@
 #ifndef BOOST_MATH_FFT_HPP
   #define BOOST_MATH_FFT_HPP
 
+  #include <algorithm>
   #include <iterator>
-  //#include <boost/math/fft/fftw_backend.hpp>
-  //#include <boost/math/fft/gsl_backend.hpp>
-  #include <boost/math/fft/bsl_backend.hpp>
+  #include <vector>
+  #include <boost/math/fft/algorithms.hpp>
 
   namespace boost { namespace math { namespace fft {
 
@@ -36,7 +36,7 @@
   */
   
   template<typename RingType,
-           template<class U> class BackendType = bsl_dft >
+           template<class U> class BackendType>
            // typename Allocator = Default_Allocator 
   class dft : public BackendType<RingType>
   {
@@ -86,15 +86,15 @@
       OutputIteratorType out,
       typename std::enable_if<(   (std::is_convertible<InputIteratorType,  const RingType*>::value == true)
                                && (std::is_convertible<OutputIteratorType,       RingType*>::value == false))>::type* = nullptr)
-    {   
+    {
       resize(std::distance(in_first,in_last));
       my_mem.resize(size());
-      
+
       if(ex==execution_type::backward)
         backend_t::backward(in_first,my_mem.data());
       else
         backend_t::forward(in_first,my_mem.data());
-      
+
       std::copy(std::begin(my_mem), std::end(my_mem), out);
     }
 
@@ -110,7 +110,7 @@
       resize(std::distance(in_first,in_last));
       my_mem.resize(size());
       std::copy(in_first, in_last, std::begin(my_mem));
-      
+
       if(ex==execution_type::backward)
         backend_t::backward(my_mem.data(),my_mem.data());
       else
@@ -118,15 +118,18 @@
         
       std::copy(std::begin(my_mem),std::end(my_mem), out);
     }
-    
+
   public:
     using backend_t = BackendType<RingType>;
     using backend_t::size;
     using backend_t::resize;
 
+    // complex types ctor. n: the size of the dft
     constexpr dft(unsigned int n) : backend_t{ n } { }
-    
-    
+
+    // ring types ctor. n: the size of the dft, w: an n-root of unity
+    constexpr dft(unsigned int n, RingType w) : backend_t( n, w ) { }
+
     template<typename InputIteratorType,
              typename OutputIteratorType>
     void forward(
@@ -135,7 +138,7 @@
     {
       execute(execution_type::forward,in_first,in_last,out);
     }
-    
+
     template<typename InputIteratorType,
              typename OutputIteratorType>
     void backward(
@@ -147,7 +150,8 @@
   };
 
   // std::transform-like Fourier Transform API
-  template<template<class U> class backend = bsl_dft,
+  // for complex types
+  template<template<class U> class backend,
            typename InputIterator,
            typename OutputIterator>
   void dft_forward(InputIterator  input_begin,
@@ -155,18 +159,13 @@
                    OutputIterator output)
   {
     using input_value_type  = typename std::iterator_traits<InputIterator >::value_type;
-    //using output_value_type = typename std::iterator_traits<OutputIterator>::value_type;
-
-    //static_assert(std::is_same<input_value_type, output_value_type>::value,
-    //  "Input and output types mismatch");
-
-    dft<input_value_type, backend> plan(std::distance(input_begin, input_end));
-
+    dft<input_value_type, backend> plan(static_cast<unsigned int>(std::distance(input_begin, input_end)));
     plan.forward(input_begin, input_end, output);
   }
 
   // std::transform-like Fourier Transform API
-  template<template<class U> class backend = bsl_dft,
+  // for complex types
+  template<template<class U> class backend,
            class InputIterator,
            class OutputIterator>
   void dft_backward(InputIterator  input_begin,
@@ -174,16 +173,90 @@
                     OutputIterator output)
   {
     using input_value_type  = typename std::iterator_traits<InputIterator >::value_type;
-    //using output_value_type = typename std::iterator_traits<OutputIterator>::value_type;
-
-    //static_assert(std::is_same<input_value_type, output_value_type>::value, 
-    //  "Input and output types mismatch");
-
-    dft<input_value_type, backend> plan(std::distance(input_begin, input_end));
-
+    dft<input_value_type, backend> plan(static_cast<unsigned int>(std::distance(input_begin, input_end)));
     plan.backward(input_begin, input_end, output);
   }
+  
+  // std::transform-like Fourier Transform API
+  // for Ring types
+  template<template<class U> class backend,
+           typename InputIterator,
+           typename OutputIterator,
+           typename value_type>
+  void dft_forward(InputIterator  input_begin,
+                   InputIterator  input_end,
+                   OutputIterator output,
+                   value_type w)
+  {
+    using input_value_type  = typename std::iterator_traits<InputIterator >::value_type;
+    dft<input_value_type, backend> plan(static_cast<unsigned int>(std::distance(input_begin, input_end)),w);
+    plan.forward(input_begin, input_end, output);
+  }
 
+  // std::transform-like Fourier Transform API
+  // for Ring types
+  template<template<class U> class backend,
+           class InputIterator,
+           class OutputIterator,
+           typename value_type>
+  void dft_backward(InputIterator  input_begin,
+                    InputIterator  input_end,
+                    OutputIterator output,
+                    value_type w)
+  {
+    using input_value_type  = typename std::iterator_traits<InputIterator >::value_type;
+    dft<input_value_type, backend> plan(static_cast<unsigned int>(std::distance(input_begin, input_end)),w);
+    plan.backward(input_begin, input_end, output);
+  }
+  
+  template<template<class U> class backend,
+           typename InputIterator1,
+           typename InputIterator2,
+           typename OutputIterator>
+  void convolution(InputIterator1 input1_begin,
+                   InputIterator1 input1_end,
+                   InputIterator2 input2_begin,
+                   OutputIterator output)
+  {
+    using input_value_type  = typename std::iterator_traits<InputIterator1>::value_type;
+    using real_value_type = typename input_value_type::value_type;
+    
+    const long N = std::distance(input1_begin,input1_end);
+    const long N_extended = detail::is_power2(N) ? N : detail::upper_bound_power2(2*N-1);
+    
+    std::vector<input_value_type> In1(N_extended),In2(N_extended),Out(N_extended);
+    
+    std::copy(input1_begin,input1_end,In1.begin());
+    
+    InputIterator2 input2_end{input2_begin};
+    std::advance(input2_end,N);
+    std::copy(input2_begin,input2_end,In2.begin());
+    
+    // padding
+    for(long i=N;i<N_extended;++i)
+      In1[i]=In2[i]=input_value_type{0};
+    
+    // fake N-periodicity
+    if(N!=N_extended)
+    for(long i=1;i<N;++i)
+      In2[N_extended-N+i] = In2[i];
+    
+    dft<input_value_type, backend> plan(static_cast<unsigned int>(N_extended));
+    plan.forward(In1.begin(),In1.end(),In1.begin());
+    plan.forward(In2.begin(),In2.end(),In2.begin());
+    
+    // direct convolution
+    std::transform(In1.begin(),In1.end(),In2.begin(),Out.begin(),std::multiplies<input_value_type>()); 
+    
+    plan.backward(Out.begin(),Out.end(),Out.begin());
+    
+    const real_value_type inv_N = real_value_type{1}/N_extended;
+    for(auto & x : Out)
+        x *= inv_N;
+    
+    std::copy(Out.begin(),Out.begin() + N,output);
+  }
+  
   } } } // namespace boost::math::fft
 
 #endif // BOOST_MATH_FFT_HPP
diff --git a/include/boost/math/fft/abstract_ring.hpp b/include/boost/math/fft/abstract_ring.hpp
index 89041ccba6..e61db6923d 100644
--- a/include/boost/math/fft/abstract_ring.hpp
+++ b/include/boost/math/fft/abstract_ring.hpp
@@ -27,6 +27,7 @@
     #include <boost/multiprecision/complex128.hpp>
   #endif
   #include <boost/multiprecision/complex_adaptor.hpp>
+  #include <boost/math/constants/constants.hpp>
 
   /*
     RingType axioms:
@@ -50,6 +51,27 @@
   */
 
   namespace boost { namespace math {  namespace fft {  namespace detail {
+  
+    #if __cplusplus < 201700L
+    template<typename ...> using void_t = void;
+    #else
+    using std::void_t;
+    #endif
+    
+
+    template<class T, typename = void_t<> >
+    struct is_complex : std::false_type
+    {};
+
+    template<class T >
+    struct is_complex<T, 
+        void_t<
+            typename T::value_type,
+            decltype(sin(std::declval<typename T::value_type>())),
+            decltype(cos(std::declval<typename T::value_type>()))
+         > > : std::true_type
+    {};
+    
 
   // Recognizes:
   // → fundamental types
diff --git a/include/boost/math/fft/algorithms.hpp b/include/boost/math/fft/algorithms.hpp
index 74beb039cf..3df6c50d86 100644
--- a/include/boost/math/fft/algorithms.hpp
+++ b/include/boost/math/fft/algorithms.hpp
@@ -14,81 +14,87 @@
   #include <cmath>
   #include <vector>
   #include <boost/math/constants/constants.hpp>
+  #include <boost/math/fft/abstract_ring.hpp>
+  #include <boost/math/fft/discrete_maths.hpp>
+  
 
 
   namespace boost { namespace math {  namespace fft {
+  
+  template<template<class U> class backend,
+           typename InputIterator1,
+           typename InputIterator2,
+           typename OutputIterator>
+  void convolution(InputIterator1 input1_begin,
+                   InputIterator1 input1_end,
+                   InputIterator2 input2_begin,
+                   OutputIterator output);
+  
+  template<class RingType>
+  class bsl_dft;
 
   namespace detail {
-
-  // TODO: use another power function
-  template <class T>
-  T power(const T& x, int n)
-  {
-    /*
-        for our use case n should always be >0.
-        However, if n==0 we would expect something
-        like 1.
-    */
-    // TODO: decide which exceptions to throw
-    if (n < 1)
-        throw std::domain_error("power(x,n) expects n>1.");
-
-    bool identity = true;
-    T r{}, aux{x};
-
-    for (; n; n >>= 1)
-    {
-      if (n & 1)
-      {
-        r = identity ? aux : r * aux;
-        identity = false;
-      }
-      aux *= aux;
-    }
-    return r;
-  }
   
-  long least_power2(long x)
+  template<class ComplexType>
+  ComplexType complex_root_of_unity(long n,long p=1)
+  /*
+    Computes exp(-i 2 pi p/n)
+  */
   {
-    /*
-      Returns the biggest power of two that is smaller or equal than x.
-    */
-    for(long lsb = x & -x; lsb!=x;)
+    using real_value_type = typename ComplexType::value_type;
+    p = modulo(p,n);
+    
+    if(p==0)
+      return ComplexType(1,0);
+    
+    long g = gcd(p,n); 
+    n/=g;
+    p/=g;
+    switch(n)
     {
-      x-=lsb;
-      lsb = x&-x;
+      case 1:
+        return ComplexType(1,0);
+      case 2:
+        return p==0 ? ComplexType(1,0) : ComplexType(-1,0);
+      case 4:
+        return p==0 ? ComplexType(1,0) : 
+               p==1 ? ComplexType(0,-1) :
+               p==2 ? ComplexType(-1,0) :
+                      ComplexType(0,1) ;
     }
-    return x;
+    using std::sin;
+    using std::cos;
+    real_value_type phase = -2*p*boost::math::constants::pi<real_value_type>()/n;
+    return ComplexType(cos(phase),sin(phase));
   }
-    
-  std::vector<long> prime_factorization(long n)
+  template<class ComplexType>
+  ComplexType complex_inverse_root_of_unity(long n,long p=1)
+  /*
+    Computes exp(i 2 pi p/n)
+  */
   {
-    std::vector<long> F;
-    for (long x = 2; x * x <= n;)
-      if (n % x == 0)
-      {
-        F.push_back(x);
-        n /= x;
-      }
-      else
-      {
-        ++x;
-      }
-    if (n > 1)
-      F.push_back(n);
-    return F;
+    return complex_root_of_unity<ComplexType>(n,-p);
   }
   
-  bool is_power2(long x) { return x == (x & -x);}
+  template<class complex_value_type>
+  inline void complex_dft_2(
+    const complex_value_type* in, 
+    complex_value_type* out, int)
+  {
+    complex_value_type 
+        o1 = in[0]+in[1], o2 = in[0]-in[1] ;
+    out[0] = o1;
+    out[1] = o2;
+  }
   
   template<class T>
-  void dft_generic_prime_bruteForce(const T* in_first, const T* in_last, T* out, const T w)
+  void dft_prime_bruteForce(const T* in_first, const T* in_last, T* out, const T w)
   /*
     assumptions: 
     - allocated memory in out is enough to hold distance(in_first,in_last) element,
   */
   {
-    const long N = std::distance(in_first,in_last);
+    const long N = static_cast<long>(std::distance(in_first,in_last));
     if(N<=0)
       return;
     
@@ -120,7 +126,7 @@
   }
   
   template<class complex_value_type>
-  void dft_complex_prime_bruteForce(
+  void complex_dft_prime_bruteForce(
     const complex_value_type* in_first, 
     const complex_value_type* in_last, 
     complex_value_type* out, int sign)
@@ -129,10 +135,7 @@
     - allocated memory in out is enough to hold distance(in_first,in_last) element,
   */
   {
-    using real_value_type = typename complex_value_type::value_type;
-    const long N = std::distance(in_first,in_last);
-    const real_value_type inv_N = real_value_type{1}/N;
-    auto signed_pi = sign * boost::math::constants::pi<real_value_type>();
+    const long N = static_cast<long>(std::distance(in_first,in_last));
     if(N<=0)
       return;
     
@@ -153,12 +156,7 @@
       complex_value_type sum{in_first[0]};
       for(long j=1;j<N; ++j)
       {
-        real_value_type phase = 2*signed_pi * i *j * inv_N;
-
-        using std::cos;
-        using std::sin;
-
-        sum += in_first[j] * complex_value_type{cos(phase),sin(phase)};
+        sum += in_first[j] * complex_root_of_unity<complex_value_type>(N,i*j*sign);
       }
       work_space[i] = sum;
     }
@@ -167,66 +165,117 @@
       std::copy(work_space,work_space+N,out);
   }
   
+  /*
+    Rader's FFT on prime sizes
+  */
+  template<class complex_value_type>
+  void complex_dft_prime_rader(
+    const complex_value_type *in_first, 
+    const complex_value_type *in_last, 
+    complex_value_type* out, int sign)
+  // precondition: distance(in_first,in_last) is prime > 2
+  {
+    const long my_n = static_cast<long>(std::distance(in_first,in_last));
+    
+    std::vector<complex_value_type> A(my_n-1),W(my_n-1),B(my_n-1);
+    
+    const long g = primitive_root(my_n);
+    const long g_inv = power_mod(g,my_n-2,my_n);
+    
+    for(long i=0;i<my_n-1;++i)
+    {
+      W[i] = complex_root_of_unity<complex_value_type>(my_n,sign*power_mod(g_inv,i,my_n));
+      A[i] = in_first[ power_mod(g,i+1,my_n) ];
+    }
+    
+    ::boost::math::fft::convolution<bsl_dft>(A.begin(),A.end(),W.begin(),B.begin());
+    
+    complex_value_type a0 = in_first[0];
+    complex_value_type sum_a {a0};
+    for(long i=1;i<my_n;++i)
+        sum_a += in_first[i];
+    
+    out[0] = sum_a;
+    for(long i=1;i<my_n;++i)
+    {
+      out[i]=a0;
+    }
+    for(long i=1;i<my_n;++i)
+    {
+      out[ power_mod(g_inv,i,my_n) ] += B[i-1];
+    }
+  }
+  
+  
   template <class T>
-  void dft_power2_dit(const T *in_first, const T *in_last, T* out, const T e)
+  void dft_composite(const T *in_first, const T *in_last, T* out, const T e)
   {
     /*
-      Cooley-Tukey mapping, in-place Decimation in Time 
+      Cooley-Tukey mapping, intrinsically out-of-place, Decimation in Time
+      composite sizes.
     */
-    const long ptrdiff = std::distance(in_first,in_last);
-    if(ptrdiff <=0 )
+    const long n = static_cast<unsigned int>(std::distance(in_first,in_last));
+    if(n <=0 )
       return;
-    const long n = least_power2(ptrdiff);
-    
-    if(in_first!=out)
-      std::copy(in_first,in_last,out);
     
     if (n == 1)
+    {
+        out[0]=in_first[0];
         return;
-
-    auto _1 = T{1};
+    }
+    auto prime_factors = prime_factorization(n);
     
-    int nbits = 0;
-    std::vector<T> e2{e};
-    for (int m = n / 2; m > 0; m >>= 1, ++nbits)
-      e2.push_back(e2.back() * e2.back());
-
-    std::reverse(e2.begin(), e2.end());
-
-    // Gold-Rader bit-reversal algorithm.
-    for(int i=0,j=0;i<n-1;++i)
-    { 
-      if(i<j)
-        std::swap(out[i],out[j]);
-      for(int k=n>>1;!( (j^=k)&k );k>>=1);
+    // reorder input
+    for (long i = 0; i < n; ++i)
+    {
+        long j = 0, k = i;
+        for (auto p : prime_factors)
+        {
+            j = j * p + k % p;
+            k /= p;
+        }
+        out[j] = in_first[i];
     }
     
+    std::reverse(prime_factors.begin(), prime_factors.end());
     
-    for (int len = 2, k = 1; len <= n; len <<= 1, ++k)
+    // butterfly pattern
+    long len = 1;
+    for (auto p : prime_factors)
     {
-      for (int i = 0; i < n; i += len)
+      long len_old = len;
+      len *= p;
+      T w_len = power(e, n / len);
+      T w_p = power(e,n/p);
+      
+      std::vector<T> tmp(p);
+      for (long i = 0; i < n; i += len)
       {
-        T ej = _1;
-        for (int j = 0; j < len / 2; ++j)
+        for(long k=0;k<len_old;++k)
         {
-          T* u = out + i + j, *v = out + i + j + len / 2;
-          T Bu = *u, Bv = *v * ej;
-          *u = Bu + Bv;
-          *v = Bu - Bv;
-          ej *= e2[k];
+          for(long j=0;j<p;++j)
+            if(j==0 || k==0)
+              tmp[j] = out[i + j*len_old +k ];
+            else
+              tmp[j] = out[i + j*len_old +k ] * power(w_len,k*j);
+          
+          dft_prime_bruteForce(tmp.data(),tmp.data()+p,tmp.data(),w_p);
+          
+          for(long j=0;j<p;++j)
+            out[i+ j*len_old + k] = tmp[j];
         }
       }
     }
   }
   
-  template <class T>
-  void dft_composite_dit(const T *in_first, const T *in_last, T* out, const T e)
+  template <class ComplexType>
+  void complex_dft_composite(const ComplexType *in_first, const ComplexType *in_last, ComplexType* out, int sign)
   {
     /*
       Cooley-Tukey mapping, intrinsically out-of-place, Decimation in Time
       composite sizes.
     */
-    const long n = std::distance(in_first,in_last);
+    const long n = static_cast<long>(std::distance(in_first,in_last));
     if(n <=0 )
       return;
     
@@ -257,22 +306,25 @@
     {
       long len_old = len;
       len *= p;
-      T w_len = power(e, n / len);
-      T w_p = power(e,n/p);
       
-      std::vector<T> tmp(p);
+      std::vector<ComplexType> tmp(p);
       for (long i = 0; i < n; i += len)
       {
         for(long k=0;k<len_old;++k)
         {
           for(long j=0;j<p;++j)
-            if(j==0 or k==0)
+            if(j==0 || k==0)
               tmp[j] = out[i + j*len_old +k ];
             else
-              tmp[j] = out[i + j*len_old +k ] * power(w_len,k*j);
-          
-          dft_generic_prime_bruteForce(tmp.data(),tmp.data()+p,tmp.data(),w_p);
+              tmp[j] = out[i + j*len_old +k ] * complex_root_of_unity<ComplexType>(len,k*j*sign);
           
+          if(p==2)
+            complex_dft_2(tmp.data(),tmp.data(),sign);
+          else
+          {
+          //  complex_dft_prime_bruteForce(tmp.data(),tmp.data()+p,tmp.data(),sign);
+            complex_dft_prime_rader(tmp.data(),tmp.data()+p,tmp.data(),sign);
+          }
           for(long j=0;j<p;++j)
             out[i+ j*len_old + k] = tmp[j];
         }
@@ -280,17 +332,77 @@
     }
   }
   
+  template <class T>
+  void dft_power2(const T *in_first, const T *in_last, T* out, const T e)
+  {
+    /*
+      Cooley-Tukey mapping, in-place Decimation in Time 
+    */
+    const long ptrdiff = static_cast<long>(std::distance(in_first,in_last));
+    if(ptrdiff <=0 )
+      return;
+    const long n = lower_bound_power2(ptrdiff);
+    
+    if(in_first!=out)
+      std::copy(in_first,in_last,out);
+    
+    if (n == 1)
+        return;
+
+    // auto _1 = T{1};
+    
+    int nbits = 0;
+    std::vector<T> e2{e};
+    for (int m = n / 2; m > 0; m >>= 1, ++nbits)
+      e2.push_back(e2.back() * e2.back());
+
+    std::reverse(e2.begin(), e2.end());
+
+    // Gold-Rader bit-reversal algorithm.
+    for(int i=0,j=0;i<n-1;++i)
+    { 
+      if(i<j)
+        std::swap(out[i],out[j]);
+      for(int k=n>>1;!( (j^=k)&k );k>>=1);
+    }
+    
+    
+    for (int len = 2, k = 1; len <= n; len <<= 1, ++k)
+    {
+      for (int i = 0; i < n; i += len)
+      {
+        {
+          int j=0;
+          T* u = out + i + j, *v = out + i + j + len / 2;
+          T Bu = *u, Bv = *v;
+          *u = Bu + Bv;
+          *v = Bu - Bv;
+        }
+        
+        T ej = e2[k];
+        for (int j = 1; j < len / 2; ++j)
+        {
+          T* u = out + i + j, *v = out + i + j + len / 2;
+          T Bu = *u, Bv = *v * ej;
+          *u = Bu + Bv;
+          *v = Bu - Bv;
+          ej *= e2[k];
+        }
+      }
+    }
+  }
+  
   template<class complex_value_type>
-  void dft_power2_dif(
+  void complex_dft_power2(
     const complex_value_type *in_first, 
     const complex_value_type *in_last, 
     complex_value_type* out, int sign)
   {
     // Naive in-place complex DFT.
-    const long ptrdiff = std::distance(in_first,in_last);
+    const long ptrdiff = static_cast<long>(std::distance(in_first,in_last));
     if(ptrdiff <=0 )
       return;
-    const long my_n = least_power2(ptrdiff);
+    const long my_n = lower_bound_power2(ptrdiff);
     
     if(in_first!=out)
       std::copy(in_first,in_last,out);
@@ -301,23 +413,14 @@
     if(in_first!=out)
       std::copy(in_first, in_last, out);
 
-    using local_float_type = typename complex_value_type::value_type;
-    const local_float_type signed_pi = sign * boost::math::constants::pi<local_float_type>();
     // Recursive decimation in frequency.
     for(long m = my_n; m > 1; m /= 2)
     {
       long mh = m / 2;
-
-      const local_float_type phi = signed_pi / mh;
-
+      
       for(long j = 0; j < mh; ++j)
       {
-        const local_float_type p = phi * j;
-
-        using std::cos;
-        using std::sin;
-
-        complex_value_type cs(cos(p), sin(p));
+        complex_value_type cs{complex_root_of_unity<complex_value_type>(m,j*sign)};
 
         for (long t1=j; t1 < j + my_n; t1 += m)
         {
@@ -346,6 +449,7 @@
 
     // Normalize for backwards transform (done externally).
   }
+  
 
   } // namespace detail
 
diff --git a/include/boost/math/fft/bsl_backend.hpp b/include/boost/math/fft/bsl_backend.hpp
index 52cac0b9dd..390393c243 100644
--- a/include/boost/math/fft/bsl_backend.hpp
+++ b/include/boost/math/fft/bsl_backend.hpp
@@ -12,61 +12,195 @@
   #include <algorithm>
   #include <cmath>
   #include <vector>
+  #include <exception>
+  #include <type_traits>
 
   #include <boost/math/fft/algorithms.hpp>
   #include <boost/math/fft/abstract_ring.hpp>
 
   namespace boost { namespace math {  namespace fft {
 
-  template<class NativeComplexType>
+
+  /*
+    Boost DFT backend:
+    It handles RingTypes and it calls the appropriate specialized functions if
+    the type is complex.
+    
+    A type is considered "complex" if is_complex::value == true. A user-defined
+    type can become complex by specializing that trait.
+    
+    We have specialized algorithms for complex numbers and general purpose DFT
+    that need the specification of a root of unity.
+    The general purpose DFT work with complex and non-complex types, but its
+    performance and precision could be lower than the specialized complex
+    versions.
+    
+    This interface selects general purpose DFT for non-complex types,
+    and for complex types the default behaviour is to use the specialized
+    complex algorithms, unless the user provides a root of unity 'W', in which case
+    the interface will execute general purpose DFT using W.
+  */
+  template<class RingType>
   class bsl_dft
   {
   private:
-    using real_value_type    = typename NativeComplexType::value_type;
-    using complex_value_type = typename detail::select_complex<real_value_type>::type;
     enum plan_type { forward_plan , backward_plan};
-    //static constexpr real_value_type pi = boost::math::constants::pi<real_value_type>();
     
-    void execute(plan_type plan, const complex_value_type * in, complex_value_type* out)const
+    template<typename U = RingType>
+    typename std::enable_if< detail::is_complex<U>::value==true  >::type
+    execute(plan_type plan, const RingType * in, RingType* out)const
+    {
+      const long N = static_cast<long>(size());
+      // select the implementation according to the type
+      if((has_root() == false) && detail::is_complex<RingType>::value)
+      {
+          const int sign = (plan == forward_plan ? 1 : -1);
+          // select the implementation according to the DFT size
+          
+          switch(N)
+          {
+            case 0:
+              return;
+            case 1:
+              out[0]=in[0];
+              return;
+            case 2:
+              detail::complex_dft_2(in,out,sign);
+              return;
+          }
+          
+          if( detail::is_power2(N) )
+          {
+            detail::complex_dft_power2(in,in+N,out,sign);
+          }
+          else if(detail::is_prime(N))
+          {
+            // detail::complex_dft_prime_bruteForce(in,in+N,out,sign);
+            detail::complex_dft_prime_rader(in,in+N,out,sign);
+          }
+          else
+          {
+            detail::complex_dft_composite(in,in+N,out,sign);
+          }
+      }else
+      {
+        const RingType w_execute = (plan==forward_plan ? root() : inverse_root());
+        
+        // select the implementation according to the DFT size
+        if( detail::is_power2(N))
+        {
+          detail::dft_power2(in,in+N,out,w_execute);
+        }
+        else
+        {
+          detail::dft_composite(in,in+N,out,w_execute);
+        }
+      }
+      
+    }
+    
+    template<typename U = RingType>
+    typename std::enable_if< detail::is_complex<U>::value==false  >::type
+    execute(plan_type plan, const RingType * in, RingType* out)const
     {
-      const int sign = (plan == forward_plan ? -1 : 1);
+      const RingType w_execute = (plan==forward_plan ? root() : inverse_root());
+      
       // select the implementation according to the DFT size
-      if( detail::is_power2(size())  )
-        detail::dft_power2_dif(in,in+size(),out,sign);
+      if( detail::is_power2(static_cast<long>(size())))
+      {
+        detail::dft_power2(in,in+size(),out,w_execute);
+      }
       else
-      { 
-        // NativeComplexType w{std::cos(2*pi/size()), sign * std::sin(2*pi/size())};
-        detail::dft_complex_prime_bruteForce(in,in+size(),out,sign);
+      {
+        detail::dft_composite(in,in+size(),out,w_execute);
       }
     }
     
   public:
-    constexpr bsl_dft(std::size_t n)
-      : my_size { n }
-    { }
+    constexpr bsl_dft(std::size_t n=0):
+        my_size{n}
+    {
+    }
+    
+    // the provided root of unity is used instead of exp(-i 2 pi/n)
+    constexpr bsl_dft(std::size_t n, RingType /* root of unity = */ w):
+        my_size{n}, _has_root{true}, my_root{w},
+        my_inverse_root{ my_size <=1 ? my_root : detail::power(my_root,my_size-1)}
+    { 
+    }
 
     ~bsl_dft()
     {
     }
     
+    bool has_root()const
+    {
+      return _has_root;
+    }
+    
     void resize(std::size_t new_size)
     {
       my_size = new_size;
+      _has_root = false;
+    }
+    void resize(std::size_t new_size, RingType w)
+    {
+      my_size = new_size;
+      _has_root = true;
+      my_root = w;
+      if(new_size<=1)
+        my_inverse_root = my_root;
+      else  
+        my_inverse_root = detail::power(my_root,new_size-1);
+    }
+    
+    // non complex types
+    template<typename U = RingType>
+    RingType root(typename std::enable_if< detail::is_complex<U>::value == false >::type* = nullptr) const
+    {
+      if(has_root() == false)
+        std::runtime_error("no root has been defined for this DFT size");
+      return my_root;
     }
+    template<typename U = RingType>
+    RingType inverse_root(typename std::enable_if< detail::is_complex<U>::value == false >::type* = nullptr) const
+    {
+      if(has_root() == false)
+        std::runtime_error("no root has been defined for this DFT size");
+      return my_inverse_root;
+    }
+    // complex types
+    template<typename U = RingType>
+    RingType root(typename std::enable_if< detail::is_complex<U>::value == true >::type* = nullptr) const
+    {
+      if(has_root())
+        return my_root;
+      return detail::complex_root_of_unity<RingType>(static_cast<long>(size()));
+    }
+    template<typename U = RingType>
+    RingType inverse_root(typename std::enable_if< detail::is_complex<U>::value == true >::type* = nullptr) const
+    {
+      if(has_root())
+        return my_root;
+      return detail::complex_inverse_root_of_unity<RingType>(static_cast<long>(size()));
+    }
+    
     constexpr std::size_t size() const { return my_size; }
 
-    void forward(const complex_value_type* in, complex_value_type* out) const
+    void forward(const RingType* in, RingType* out) const
     {
       execute(forward_plan,in,out);   
     }
 
-    void backward(const complex_value_type* in, complex_value_type* out) const
+    void backward(const RingType* in, RingType* out) const
     {
       execute(backward_plan,in,out);   
     }
 
   private:
-    std::size_t my_size;
+    std::size_t my_size{};
+    bool _has_root=false;
+    RingType my_root, my_inverse_root;
   };
 
   } } } // namespace boost::math::fft
diff --git a/include/boost/math/fft/discrete_maths.hpp b/include/boost/math/fft/discrete_maths.hpp
new file mode 100644
index 0000000000..87619efa0b
--- /dev/null
+++ b/include/boost/math/fft/discrete_maths.hpp
@@ -0,0 +1,374 @@
+///////////////////////////////////////////////////////////////////
+//  Copyright Eduardo Quintana 2021
+//  Copyright Janek Kozicki 2021
+//  Copyright Christopher Kormanyos 2021
+//  Distributed under the Boost Software License,
+//  Version 1.0. (See accompanying file LICENSE_1_0.txt
+//  or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <algorithm>
+#include <limits>
+#include <type_traits>
+
+#ifndef BOOST_MATH_FFT_DISCRETE_HPP
+  #define BOOST_MATH_FFT_DISCRETE_HPP
+  
+  namespace boost { namespace math {  namespace fft {
+  namespace detail {
+  
+  template <typename integer>
+  integer modulo(integer a, integer b)
+  // it computes 'a mod b' in the mathematical sense, not 'a%b'
+  // it can handle a<0 values
+  // precondition: b>0
+  {
+    return ( a%b + b ) % b;
+  }
+  
+  template <typename integer>
+  struct euclid_return
+  // helper class for the extended euclidean algorithm
+  {
+    integer gcd, x, y;
+  };
+  
+  template <typename integer>
+  euclid_return<integer> extended_euclid(integer a, integer b)
+  // extended euclidean algorithm. It solves Bezout equation for ANY integer
+  // pair (a,b):
+  // g = a x  + b y
+  {
+    using std::swap;
+    static_assert(std::numeric_limits<integer>::is_signed,
+                  "Signed integers are needed");
+    integer x = 1, y = 0, g = a;
+    for (integer x1 = 0, y1 = 1, g1 = b; g1;)
+    {
+      integer q = g / g1;
+      x = x - x1 * q;
+      swap(x, x1);
+      y = y - y1 * q;
+      swap(y, y1);
+      g = g - g1 * q;
+      swap(g, g1);
+    }
+    return {g, x, y};
+  }
+  
+  template <typename integer>
+  integer gcd(integer a, integer b)
+  // Greatest Commond Divisor
+  {
+    euclid_return<integer> ret = extended_euclid(a,b);
+    return ret.gcd;
+  }
+  
+  template <typename T,typename integer>
+  T power(const T& x, integer n)
+    // WARNING: 
+    // -> for n<0 we cannot decide the outcome, because we don't know the inverse of x
+    // -> for n=0 we cannot decide the outcome, because we don't know the
+    // multiplication neutral element of T.
+  // precondition: n>0
+  {
+    if(n<=0)
+      return x;
+      
+    bool identity = true;
+    T r{x};
+
+    for (T aux{x}; n; n /= 2)
+    {
+      if (n % 2)
+      {
+        r = identity ? aux : r * aux;
+        identity = false;
+      }
+      aux *= aux;
+    }
+    return r;
+  }
+  
+  template<typename integer>
+  constexpr 
+  typename std::enable_if<std::is_integral<integer>::value,bool>::type 
+  // enabled only for builtin integer types
+  is_power2(integer x) 
+  { 
+    return (x>0) && (x == (x & -x));
+  }
+  
+  template<typename integer>
+  typename std::enable_if<std::is_integral<integer>::value,integer>::type 
+  // enabled only for builtin integer types
+  lower_bound_power2(integer x)
+  // Returns the biggest power of two that is smaller than or equal to x.
+  // returns 1 if no such power of two exists, ie. for x<=0
+  {
+    if(x<=0)
+      return 0;
+      
+    for(integer lsb = x & -x; lsb!=x;)
+    {
+      x-=lsb;
+      lsb = x&-x;
+    }
+    return x;
+  }
+  
+  
+  template<typename integer>
+  typename std::enable_if<std::is_integral<integer>::value,integer>::type 
+  // enabled only for builtin integer types
+  upper_bound_power2(integer x)
+  // Returns the smallest power of two that is greater or equal to x.
+  // returns 0 if no such power of two exits, ie. when the value overflows
+  // the integer
+  {
+    if(x<=1)
+      return 1;
+    
+    const integer up_max = lower_bound_power2(
+      std::numeric_limits<integer>::max() );
+    if(up_max<x)
+      return 0;
+    
+    integer up=1;
+    for(;up<x;up<<=1);
+    return up;
+  }
+  template <typename integer, typename Exp>
+  bool power_greater(integer A, integer B, Exp p)
+    // precondition: A,B,p > 0
+    // returns true if A^p > B
+  {
+    if (A == 1)
+      return 1 > B;
+    if (p > std::numeric_limits<integer>::digits)
+      return true;
+
+    while (p--)
+      B /= A;
+    return 1 > B;
+  }
+  template <typename integer, typename exp_t>
+  integer root(const integer N, const exp_t p)
+    // binary search to find the biggest r such that r^p <= N,
+    // ie. r = floor( pRoot(N)  )
+  {
+    if (p == 1)
+      return N;
+    // no overflow
+    integer down = 1, up = N;
+    while (up - down > 1)
+    {
+      integer mid = down + (up - down) / 2;
+      if (power_greater(mid, N, p))
+        up = mid;
+      else
+        down = mid;
+    }
+    return down;
+  }
+  template <typename integer>
+  class mint
+  /*
+    Helper class for modular arithmetics,
+    with modulo known at runtime.
+    WARNING: the big flaw is that there are no checks for the homogeneity
+    of the modulo, ie. you could a+b where a.mod != b.mod
+  */
+  {
+    static bool overflow_by_multiplication(integer m)
+    // checks if the give modular base 'm' can overflow the direct
+    // 'integer' multiplication
+    {
+      integer sqrt_max = root(std::numeric_limits<integer>::max(), 2);
+      return m - 1 > sqrt_max;
+    }
+  
+    const integer mod{};
+    integer x{};
+
+   public:
+    mint(){}
+    mint(integer m,integer _x) :  mod{m}, x{_x}
+    {
+      x = modulo(x,mod);
+    }
+    mint(const mint& that) : mod{that.mod}, x{that.x} {}
+
+    mint& operator=(const mint& that) { return x = that.x, *this; }
+
+    explicit operator bool() const { return x == integer{0}; }
+    operator integer() const { return x; }
+
+    mint& operator++() { return x = (x + 1) % mod, *this; }
+    mint& operator--() { return x = (mod + x - 1) % mod, *this;}
+    mint operator++(int)
+    { /*post*/
+      mint temp(*this);
+      this->operator++();
+      return temp;
+    }
+    mint operator--(int)
+    { /*post*/
+      mint temp(*this);
+      this->operator--();
+      return temp;
+    }
+    mint& operator+=(const mint& that){return x = (x + that.x) % mod, *this;} 
+    mint& operator-=(const mint& t){return x = (mod + x - t.x) % mod, *this;}
+
+    mint& operator*=(const mint& t)
+    {
+      if (overflow_by_multiplication(mod))
+      {
+        /// multiplication based on sums
+        integer res{0};
+        for (integer a = x, b = t.x; b; b /= 2)
+        {
+          if (b % 2)
+            res = (res + a) % mod;
+          a = (a + a) % mod;
+        }
+        x = res;
+        return *this;
+      }
+      // direct multiplication
+      x = (x * t.x) % mod;
+      return *this;
+    }
+
+    mint<integer> inverse() const
+    {
+        euclid_return<integer> ee = extended_euclid(x, mod);
+        if (ee.gcd != 1)
+            return mint<integer>(0,0); // no inverse found
+        integer inv_x = ee.x;
+        return mint(mod,inv_x);
+        // return power(*this,mod-2);
+    }
+    bool operator==(const mint& that) const { return x == that.x; }
+    bool operator<(const mint& that) const { return x < that.x; }
+  };
+  template <typename T>
+  mint<T> operator*(const mint<T>& A, const mint<T>& B)
+  {
+    mint<T> C{A};
+    return C *= B;
+  }
+  
+  template<typename integer>
+  integer power_mod(integer x, integer n, integer m)
+  // Computes x^n mod m
+  {
+    if(x==0 || x==1)
+        return x;
+    if(n<=0)
+      return 1;
+    mint<integer> xm(m,x);
+    // n = modulo(n,euler_phi(m));
+    xm = power(xm,n);
+    return integer(xm);
+  }
+  
+  inline bool is_prime(int n)
+  {
+    // Naive O(sqrt(n)) prime decision
+    // TODO: replace with something more sophisticated like Rabin-Miller's test
+    
+    if(n<2)
+      return false;
+    if(n==2 || n==3)
+      return true;
+    
+    const int sqrt_n = root(n,2);
+    for(int x=2;x<=sqrt_n;++x)
+    {
+      if(n%x == 0)
+        return false;
+    }
+    return true;
+  }
+  
+  inline std::vector<int> prime_factorization(int n)
+  // Naive O(sqrt(n)) prime factorization.
+  // returns a list of prime numbers that when multiplied they give n
+  {
+    std::vector<int> F;
+    
+    const int sqrt_n = root(n,2);
+    for (int x = 2; x <= sqrt_n;)
+      if (n % x == 0)
+      {
+        F.push_back(x);
+        n /= x;
+      }
+      else
+      {
+        ++x;
+      }
+    if (n > 1)
+      F.push_back(n);
+    return F;
+  }
+  
+  inline std::vector<int> prime_factorization_unique(int n)
+  // returns the list of unique prime factors divisors of n
+  {
+    std::vector<int> F{prime_factorization(n)};
+    std::sort(F.begin(),F.end());
+    std::vector<int> F_unique;
+    int last=-1;
+    for(auto p: F)
+    {
+      if(p==last)
+        continue;
+      
+      last=p;
+      F_unique.push_back(p);
+    }
+    return F_unique;
+  }
+ 
+  inline int euler_phi(int n)
+  // Euler Phi function
+  {
+    int r = n;
+    std::vector<int> F = prime_factorization_unique(n);
+    for (auto p : F)
+        r -= r / p;
+    return r;
+  }
+  
+  inline int primitive_root(int n)
+  // it finds a primitive root r of n,
+  // ie. r^phi(n) = 1 mod n
+  {
+    const int phi = euler_phi(n);
+    std::vector<int> F = prime_factorization_unique(phi);
+    for(int i=1;i<n;++i)
+    {
+      int g = gcd(i,n);
+      if(g!=1) continue;
+      
+      bool ok = true;
+      for(auto p : F)
+      {
+        if(power_mod(i,phi/p,n)==1) // not a root
+        {
+          ok=false;
+          break;
+        }
+      }
+      
+      if(ok)
+        return i;
+    }
+    return 0; // no roots found
+  }
+
+  } // namespace detail
+  } } } // namespace boost::math::fft
+#endif
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index 7bdf25da01..fb778226cf 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -1466,5 +1466,7 @@ test-suite fft :
    [ run fft_compile.cpp ../config//fftw3 ../config//fftw3f ../config//fftw3l ../config//gsl ]
    [ run fft_correctedness.cpp ../config//fftw3 ../config//fftw3f ../config//fftw3l ../config//fftw3q ../config//gsl ../config//quadmath : <include>../../multiprecision/include ]
    [ run fft_iterators.cpp ../config//fftw3 ../config//fftw3f ../config//fftw3l ../config//gsl ]
+   [ run fft_non-complex.cpp ]
+   [ run fft_auxiliary_functions.cpp ]
    [ compile-fail fft_compile-fail.cpp  ]
 ;
diff --git a/test/fft_auxiliary_functions.cpp b/test/fft_auxiliary_functions.cpp
new file mode 100644
index 0000000000..ae3caac59e
--- /dev/null
+++ b/test/fft_auxiliary_functions.cpp
@@ -0,0 +1,107 @@
+#include "math_unit_test.hpp"
+#include <boost/math/fft/algorithms.hpp>
+
+using namespace boost::math::fft;
+
+void test_is_prime()
+{
+  CHECK_EQUAL(false,detail::is_prime(1));
+  CHECK_EQUAL(true,detail::is_prime(2));
+  CHECK_EQUAL(true,detail::is_prime(3));
+  CHECK_EQUAL(false,detail::is_prime(4));
+  CHECK_EQUAL(true,detail::is_prime(5));
+  CHECK_EQUAL(false,detail::is_prime(6));
+  CHECK_EQUAL(true,detail::is_prime(7));
+  CHECK_EQUAL(false,detail::is_prime(8));
+  CHECK_EQUAL(false,detail::is_prime(9));
+  CHECK_EQUAL(false,detail::is_prime(10));
+  
+  // factorial primes 
+  CHECK_EQUAL(true,detail::is_prime(719));
+  CHECK_EQUAL(true,detail::is_prime(5039));
+  CHECK_EQUAL(true,detail::is_prime(39916801));
+  CHECK_EQUAL(true,detail::is_prime(479001599));
+}
+void test_primitive_root()
+{
+  for(auto p : std::vector<long>{3,5,7,11,13,17,23,29})
+  {
+    const long r = detail::primitive_root(p);
+    std::cerr << "p = " << p << ", root = " << r << '\n';
+    const long phi = p-1;
+    
+    long r_p = r;
+    for(int i=1;i<phi;++i)
+    {
+      CHECK_EQUAL(false,r_p == 1);
+      r_p = (r_p * r) % p;
+    }
+      CHECK_EQUAL(true,r_p == 1);
+  }
+}
+
+void test_power2()
+{
+  CHECK_EQUAL(false,detail::is_power2(-4));   
+  CHECK_EQUAL(false,detail::is_power2(-3));   
+  CHECK_EQUAL(false,detail::is_power2(-2));   
+  CHECK_EQUAL(false,detail::is_power2(-1));   
+  CHECK_EQUAL(false,detail::is_power2(0));   
+  CHECK_EQUAL(false,detail::is_power2(3));   
+  CHECK_EQUAL(false,detail::is_power2(5));   
+  CHECK_EQUAL(false,detail::is_power2(6));   
+  CHECK_EQUAL(false,detail::is_power2(7));   
+  
+  for(int i=9;i<16;++i)
+    CHECK_EQUAL(false,detail::is_power2(i));   
+  
+  CHECK_EQUAL(true,detail::is_power2(1));   
+  CHECK_EQUAL(true,detail::is_power2(2));   
+  CHECK_EQUAL(true,detail::is_power2(4));   
+  CHECK_EQUAL(true,detail::is_power2(8));   
+  CHECK_EQUAL(true,detail::is_power2(16));   
+  
+  for(int i=-20;i<=0;++i)
+    CHECK_EQUAL(0,detail::lower_bound_power2(i));   
+    
+  CHECK_EQUAL(1,detail::lower_bound_power2(1));   
+  CHECK_EQUAL(2,detail::lower_bound_power2(2));   
+  CHECK_EQUAL(2,detail::lower_bound_power2(3));   
+  CHECK_EQUAL(4,detail::lower_bound_power2(4));   
+  CHECK_EQUAL(4,detail::lower_bound_power2(5));   
+  CHECK_EQUAL(4,detail::lower_bound_power2(6));   
+  CHECK_EQUAL(4,detail::lower_bound_power2(7));   
+  for(int i=8;i<16;++i)
+    CHECK_EQUAL(8,detail::lower_bound_power2(i));   
+  for(int i=16;i<32;++i)
+    CHECK_EQUAL(16,detail::lower_bound_power2(i));   
+  
+  for(int i=-20;i<=0;++i)
+    CHECK_EQUAL(1,detail::upper_bound_power2(i));   
+    
+  CHECK_EQUAL(1,detail::upper_bound_power2(1));   
+  CHECK_EQUAL(2,detail::upper_bound_power2(2));   
+  CHECK_EQUAL(4,detail::upper_bound_power2(3));   
+  CHECK_EQUAL(4,detail::upper_bound_power2(4));   
+  CHECK_EQUAL(8,detail::upper_bound_power2(5));   
+  CHECK_EQUAL(8,detail::upper_bound_power2(6));   
+  CHECK_EQUAL(8,detail::upper_bound_power2(7));   
+  CHECK_EQUAL(8,detail::upper_bound_power2(8));   
+  for(int i=9;i<=16;++i)
+    CHECK_EQUAL(16,detail::upper_bound_power2(i));   
+  for(int i=17;i<=32;++i)
+    CHECK_EQUAL(32,detail::upper_bound_power2(i));   
+  CHECK_EQUAL(0,detail::upper_bound_power2(std::numeric_limits<int>::max()));   
+  CHECK_EQUAL(0,detail::upper_bound_power2((1<<30)+1));   
+  CHECK_EQUAL(1<<30,detail::upper_bound_power2(1<<30));   
+  CHECK_EQUAL(1<<30,detail::upper_bound_power2((1<<30) - 1));   
+}
+
+int main()
+{
+  test_power2();
+  test_is_prime(); 
+  test_primitive_root();
+  return boost::math::test::report_errors();
+}
+
diff --git a/test/fft_benchmark.cpp b/test/fft_benchmark.cpp
index 542a87a19f..db96303313 100644
--- a/test/fft_benchmark.cpp
+++ b/test/fft_benchmark.cpp
@@ -2,6 +2,7 @@
 #include <random>
 #include <complex>
 #include <boost/math/fft.hpp>
+#include <boost/math/fft/bsl_backend.hpp>
 #include <boost/math/fft/fftw_backend.hpp>
 #include <boost/math/fft/gsl_backend.hpp>
 #include <boost/math/constants/constants.hpp>
diff --git a/test/fft_compile-fail.cpp b/test/fft_compile-fail.cpp
index 5be867747d..5cac6b1dc5 100644
--- a/test/fft_compile-fail.cpp
+++ b/test/fft_compile-fail.cpp
@@ -1,4 +1,5 @@
 #include <boost/math/fft.hpp>
+#include <boost/math/fft/bsl_backend.hpp>
 
 #include <complex>
 #include <vector>
diff --git a/test/fft_compile.cpp b/test/fft_compile.cpp
index 552e239f3c..29e843f592 100644
--- a/test/fft_compile.cpp
+++ b/test/fft_compile.cpp
@@ -1,6 +1,8 @@
 #include <boost/math/fft.hpp>
+#include <boost/math/fft/bsl_backend.hpp>
 #include <boost/math/fft/fftw_backend.hpp>
 #include <boost/math/fft/gsl_backend.hpp>
+#include <boost/multiprecision/cpp_complex.hpp>
 
 #include <complex>
 #include <vector>
@@ -9,71 +11,88 @@
 template<class T,int N, template< class U> class Backend >
 void transform_api()
 {   
-    // test same type of iterator
-    std::vector<T> A(N),B(A.size());
-    boost::math::fft::dft_forward<Backend>(A.begin(),A.end(),B.begin());
-    boost::math::fft::dft_backward<Backend>(A.begin(),A.end(),B.begin());
-    
-    // test with raw pointers
-    boost::math::fft::dft_forward<Backend>(A.data(),A.data()+A.size(),B.data());
-    boost::math::fft::dft_backward<Backend>(A.data(),A.data()+A.size(),B.data());
+  // test same type of iterator
+  std::vector<T> A(N),B(A.size());
+  boost::math::fft::dft_forward<Backend>(A.begin(),A.end(),B.begin());
+  boost::math::fft::dft_backward<Backend>(A.begin(),A.end(),B.begin());
+  
+  // test with raw pointers
+  boost::math::fft::dft_forward<Backend>(A.data(),A.data()+A.size(),B.data());
+  boost::math::fft::dft_backward<Backend>(A.data(),A.data()+A.size(),B.data());
 
-    const auto & cA = A;
-    // const iterator as input
-    boost::math::fft::dft_forward<Backend>(cA.begin(),cA.end(),B.begin());
-    boost::math::fft::dft_backward<Backend>(cA.begin(),cA.end(),B.begin());
-    
-    // const pointer as input
-    boost::math::fft::dft_forward<Backend>(cA.data(),cA.data()+cA.size(),B.data());
-    boost::math::fft::dft_backward<Backend>(cA.data(),cA.data()+cA.size(),B.data());
-    
-    std::array<T,N> C;
-    // input as vector::iterator, output as array::iterator
-    boost::math::fft::dft_forward<Backend>(A.begin(),A.end(),C.begin());
-    boost::math::fft::dft_backward<Backend>(A.begin(),A.end(),C.begin());
-    boost::math::fft::dft_forward<Backend>(A.data(),A.data()+A.size(),C.data());
-    boost::math::fft::dft_backward<Backend>(A.data(),A.data()+A.size(),C.data());
-    
-    // input as array::iterator, output as vector::iterator
-    boost::math::fft::dft_forward<Backend>(C.begin(),C.end(),B.begin());
-    boost::math::fft::dft_backward<Backend>(C.begin(),C.end(),B.begin());
-    boost::math::fft::dft_forward<Backend>(C.data(),C.data()+C.size(),B.data());
-    boost::math::fft::dft_backward<Backend>(C.data(),C.data()+C.size(),B.data());
+  const auto & cA = A;
+  // const iterator as input
+  boost::math::fft::dft_forward<Backend>(cA.begin(),cA.end(),B.begin());
+  boost::math::fft::dft_backward<Backend>(cA.begin(),cA.end(),B.begin());
+  
+  // const pointer as input
+  boost::math::fft::dft_forward<Backend>(cA.data(),cA.data()+cA.size(),B.data());
+  boost::math::fft::dft_backward<Backend>(cA.data(),cA.data()+cA.size(),B.data());
+  
+  std::array<T,N> C;
+  // input as vector::iterator, output as array::iterator
+  boost::math::fft::dft_forward<Backend>(A.begin(),A.end(),C.begin());
+  boost::math::fft::dft_backward<Backend>(A.begin(),A.end(),C.begin());
+  boost::math::fft::dft_forward<Backend>(A.data(),A.data()+A.size(),C.data());
+  boost::math::fft::dft_backward<Backend>(A.data(),A.data()+A.size(),C.data());
+  
+  // input as array::iterator, output as vector::iterator
+  boost::math::fft::dft_forward<Backend>(C.begin(),C.end(),B.begin());
+  boost::math::fft::dft_backward<Backend>(C.begin(),C.end(),B.begin());
+  boost::math::fft::dft_forward<Backend>(C.data(),C.data()+C.size(),B.data());
+  boost::math::fft::dft_backward<Backend>(C.data(),C.data()+C.size(),B.data());
 }
 
 template<class T,int N, template<class U> class Backend >
 void plan_api()
 {
-    Backend<T> P(N);    
-    std::vector<T> A(N),B(N);
-    P.forward(A.data(),B.data());
-    P.backward(A.data(),B.data());
+  Backend<T> P(N);    
+  std::vector<T> A(N),B(N);
+  P.forward(A.data(),B.data());
+  P.backward(A.data(),B.data());
+}
+
+struct my_type{};
+
+void test_traits()
+{
+  using boost::math::fft::detail::is_complex;
+  static_assert(is_complex< std::complex<float> >::value);
+  static_assert(is_complex< std::complex<double> >::value);
+  static_assert(is_complex< std::complex<long double> >::value);
+  static_assert(is_complex< float >::value==false);
+  static_assert(is_complex< my_type >::value==false);
+  static_assert(is_complex< my_type >::value==false);
+  static_assert(is_complex<
+    std::complex<boost::multiprecision::cpp_bin_float_50> >::value );
+  static_assert(is_complex< boost::multiprecision::cpp_complex_quad >::value );
 }
 
 int main()
 {
-  
-    transform_api< std::complex<float>,      3,boost::math::fft::fftw_dft >();
+  test_traits();
+
+  transform_api< std::complex<float>,      3,boost::math::fft::fftw_dft >();
 #if (not defined(__clang__))
-    transform_api< std::complex<double>,     3,boost::math::fft::fftw_dft >();
+  transform_api< std::complex<double>,     3,boost::math::fft::fftw_dft >();
 #endif
-    transform_api< std::complex<long double>,3,boost::math::fft::fftw_dft >();
-    
-    transform_api< std::complex<double>,3,boost::math::fft::gsl_dft >();
-    
-    transform_api< std::complex<float>,      4,boost::math::fft::bsl_dft >();
-    transform_api< std::complex<double>,     4,boost::math::fft::bsl_dft >();
-    transform_api< std::complex<long double>,4,boost::math::fft::bsl_dft >();
-    
-    plan_api< std::complex<double>,      3, boost::math::fft::fftw_dft >();
-    plan_api< std::complex<float>,       3, boost::math::fft::fftw_dft >();
-    plan_api< std::complex<long double>, 3, boost::math::fft::fftw_dft >();
-    
-    plan_api< std::complex<double>,3, boost::math::fft::gsl_dft >();
-    
-    plan_api< std::complex<double>,      4, boost::math::fft::bsl_dft >();
-    plan_api< std::complex<float>,       4, boost::math::fft::bsl_dft >();
-    plan_api< std::complex<long double>, 4, boost::math::fft::bsl_dft >();
-    
-    return 0;
+  transform_api< std::complex<long double>,3,boost::math::fft::fftw_dft >();
+  
+  transform_api< std::complex<double>,3,boost::math::fft::gsl_dft >();
+  
+  transform_api< std::complex<float>,      4,boost::math::fft::bsl_dft >();
+  transform_api< std::complex<double>,     4,boost::math::fft::bsl_dft >();
+  transform_api< std::complex<long double>,4,boost::math::fft::bsl_dft >();
+  
+  plan_api< std::complex<double>,      3, boost::math::fft::fftw_dft >();
+  plan_api< std::complex<float>,       3, boost::math::fft::fftw_dft >();
+  plan_api< std::complex<long double>, 3, boost::math::fft::fftw_dft >();
+  
+  plan_api< std::complex<double>,3, boost::math::fft::gsl_dft >();
+  
+  plan_api< std::complex<double>,      4, boost::math::fft::bsl_dft >();
+  plan_api< std::complex<float>,       4, boost::math::fft::bsl_dft >();
+  plan_api< std::complex<long double>, 4, boost::math::fft::bsl_dft >();
+  
+  return 0;
 }
diff --git a/test/fft_correctedness.cpp b/test/fft_correctedness.cpp
index 8421c6109d..dac138a8f3 100644
--- a/test/fft_correctedness.cpp
+++ b/test/fft_correctedness.cpp
@@ -1,5 +1,6 @@
 #include "math_unit_test.hpp"
 #include <boost/math/fft.hpp>
+#include <boost/math/fft/bsl_backend.hpp>
 #include <boost/math/fft/fftw_backend.hpp>
 #include <boost/math/fft/gsl_backend.hpp>
 #include <boost/math/fft/algorithms.hpp>
@@ -19,15 +20,69 @@
 
 using namespace boost::math::fft;
 
+template<class T>
+void convolution_brute_force(
+  const T* first1, const T* last1, 
+  const T* first2,
+  T* out)
+{
+  long N = std::distance(first1,last1);
+  for(long i=0;i<N;++i)
+  {
+    T sum{0};
+    for(int j=0;j<N;++j)
+    {
+      sum += first1[j] * first2[(i-j+N) % N];
+    }
+    out[i] = sum;
+  }
+}
 
+template<class T, template<class U> class Backend>
+void test_convolution(unsigned int N, int tolerance)
+{
+  using Complex = typename detail::select_complex<T>::type;
+  const T tol = tolerance*std::numeric_limits<T>::epsilon();
+  
+  // ...
+  boost::random::mt19937 rng;
+  boost::random::uniform_real_distribution<T> U(0.0,1.0);
+  {
+    std::vector<Complex> A(N),B(N),C(N);
+    
+    for(auto& x: A)
+    {
+        x.real( U(rng) );
+        x.imag( U(rng) );
+    }
+    for(auto& x: B)
+    {
+        x.real( U(rng) );
+        x.imag( U(rng) );
+    }
+    convolution_brute_force(A.data(),A.data()+N,B.data(),C.data());
+    
+    std::vector<Complex> C_candidate;
+    convolution<Backend>(A.data(),A.data()+N,B.data(),std::back_inserter(C_candidate));
+    
+    T diff{0.0};
+    
+    for(size_t i=0;i<N;++i)
+    {
+        using std::norm;
+        diff += norm(C[i]-C_candidate[i]);
+    }
+    using std::sqrt;
+    diff = sqrt(diff)/N;
+    CHECK_MOLLIFIED_CLOSE(T{0.0},diff,tol);
+  }
+}
 
 template<class T, template<class U> class Backend>
-void test_fixed_transforms()
+void test_fixed_transforms(int tolerance)
 {
   using Complex = typename detail::select_complex<T>::type;
-  // TODO: increase precision of the generic dft 
-  // const T tol = std::numeric_limits<T>::epsilon();
-    const T tol = 8*std::numeric_limits<T>::epsilon();
+    const T tol = tolerance*std::numeric_limits<T>::epsilon();
     {
         std::vector< Complex > A{1.0},B(1);
         dft_forward<Backend>(A.data(),A.data()+A.size(),B.data());
@@ -79,12 +134,10 @@ void test_fixed_transforms()
 
 
 template<class T, template<class U> class Backend>
-void test_inverse(int N)
+void test_inverse(int N, int tolerance)
 {
   using Complex = typename detail::select_complex<T>::type;
-  // TODO: increase precision of the generic dft 
-  // const T tol = std::numeric_limits<T>::epsilon();
-  const T tol = 128*std::numeric_limits<T>::epsilon();
+  const T tol = tolerance*std::numeric_limits<T>::epsilon();
   
   boost::random::mt19937 rng;
   boost::random::uniform_real_distribution<T> U(0.0,1.0);
@@ -119,131 +172,196 @@ void test_inverse(int N)
 
 int main()
 {
-  test_fixed_transforms<float,fftw_dft>();
-  test_fixed_transforms<double,fftw_dft>();
-  test_fixed_transforms<long double,fftw_dft>();
+  test_fixed_transforms<float,fftw_dft>(1);
+  test_fixed_transforms<double,fftw_dft>(1);
+  test_fixed_transforms<long double,fftw_dft>(1);
+  
+   
 #ifdef BOOST_MATH_USE_FLOAT128
-  test_fixed_transforms<boost::multiprecision::float128,fftw_dft>();
+  test_fixed_transforms<boost::multiprecision::float128,fftw_dft>(1);
 #endif
   
-  test_fixed_transforms<double,gsl_dft>();
+  test_fixed_transforms<double,gsl_dft>(1);
+
+  test_fixed_transforms<float,test_dft_prime_bruteForce>(4);
+  test_fixed_transforms<double,test_dft_prime_bruteForce>(4);
+  test_fixed_transforms<long double,test_dft_prime_bruteForce>(4);
+  
+  test_fixed_transforms<float,test_complex_dft_prime_bruteForce>(2);
+  test_fixed_transforms<double,test_complex_dft_prime_bruteForce>(2);
+  test_fixed_transforms<long double,test_complex_dft_prime_bruteForce>(2);
+  
+  test_fixed_transforms<float,test_dft_composite>(4);
+  test_fixed_transforms<double,test_dft_composite>(4);
+  test_fixed_transforms<long double,test_dft_composite>(4);
+  
+  test_fixed_transforms<float,test_complex_dft_composite>(2);
+  test_fixed_transforms<double,test_complex_dft_composite>(2);
+  test_fixed_transforms<long double,test_complex_dft_composite>(2);
   
-  test_fixed_transforms<float,bsl_dft>();
-  test_fixed_transforms<double,bsl_dft>();
-  test_fixed_transforms<long double,bsl_dft>();
+  test_fixed_transforms<float,bsl_dft>(2);
+  test_fixed_transforms<double,bsl_dft>(2);
+  test_fixed_transforms<long double,bsl_dft>(2);
 #ifdef BOOST_MATH_USE_FLOAT128
-  test_fixed_transforms<boost::multiprecision::float128,bsl_dft>();
+  test_fixed_transforms<boost::multiprecision::float128,bsl_dft>(1);
 #endif
-  test_fixed_transforms<boost::multiprecision::cpp_bin_float_50,bsl_dft>();
-  test_fixed_transforms<boost::multiprecision::cpp_bin_float_100,bsl_dft>();
-  test_fixed_transforms<boost::multiprecision::cpp_bin_float_quad,bsl_dft>();
+  test_fixed_transforms<boost::multiprecision::cpp_bin_float_50,bsl_dft>(2);
+  test_fixed_transforms<boost::multiprecision::cpp_bin_float_100,bsl_dft>(2);
+  test_fixed_transforms<boost::multiprecision::cpp_bin_float_quad,bsl_dft>(1);
   // TODO:
-  //test_fixed_transforms<boost::multiprecision::mpfr_float_100,bsl_dft>();
+  //test_fixed_transforms<boost::multiprecision::mpfr_float_100,bsl_dft>(1);
   
   for(int i=1;i<=(1<<10); i*=2)
   {
-    test_inverse<float,fftw_dft>(i);
-    test_inverse<double,fftw_dft>(i);
-    test_inverse<long double,fftw_dft>(i);
+    test_convolution<double,fftw_dft>(i,128);
+    
+    test_inverse<float,fftw_dft>(i,1);
+    test_inverse<double,fftw_dft>(i,1);
+    test_inverse<long double,fftw_dft>(i,1);
 #ifdef BOOST_MATH_USE_FLOAT128
-    test_inverse<boost::multiprecision::float128,fftw_dft>(i);
+    test_inverse<boost::multiprecision::float128,fftw_dft>(i,1);
 #endif
     
-    test_inverse<double,gsl_dft>(i);
+    test_inverse<double,gsl_dft>(i,1);
     
-    test_inverse<float,bsl_dft>(i);
-    test_inverse<double,bsl_dft>(i);
-    test_inverse<long double,bsl_dft>(i);
+    test_inverse<float,bsl_dft>(i,1);
+    test_inverse<double,bsl_dft>(i,1);
+    test_inverse<long double,bsl_dft>(i,1);
 #ifdef BOOST_MATH_USE_FLOAT128
-    test_inverse<boost::multiprecision::float128,bsl_dft>(i);
+    test_inverse<boost::multiprecision::float128,bsl_dft>(i,1);
 #endif
-    test_inverse<boost::multiprecision::cpp_bin_float_50,bsl_dft>(i);
+    test_inverse<boost::multiprecision::cpp_bin_float_50,bsl_dft>(i,1);
     
-    test_inverse<float,test_dft_power2_dit>(i);
-    test_inverse<float,test_dft_power2_dif>(i);
+    test_inverse<float,test_dft_power2>(i,32);
+    test_inverse<float,test_complex_dft_power2>(i,1);
     
-    test_inverse<double,test_dft_power2_dit>(i);
-    test_inverse<double,test_dft_power2_dif>(i);
+    test_inverse<double,test_dft_power2>(i,32);
+    test_inverse<double,test_complex_dft_power2>(i,1);
     
-    test_inverse<long double,test_dft_power2_dit>(i);
-    test_inverse<long double,test_dft_power2_dif>(i);
+    test_inverse<long double,test_dft_power2>(i,32);
+    test_inverse<long double,test_complex_dft_power2>(i,1);
 
 #ifdef BOOST_MATH_USE_FLOAT128
-    test_inverse<boost::multiprecision::float128,test_dft_power2_dit>(i);
-    test_inverse<boost::multiprecision::float128,test_dft_power2_dif>(i);
+    test_inverse<boost::multiprecision::float128,test_dft_power2>(i,32);
+    test_inverse<boost::multiprecision::float128,test_complex_dft_power2>(i,1);
 #endif
 
-    test_inverse<boost::multiprecision::cpp_bin_float_50,test_dft_power2_dit>(i);
-    test_inverse<boost::multiprecision::cpp_bin_float_50,test_dft_power2_dif>(i);
+    test_inverse<boost::multiprecision::cpp_bin_float_50,test_dft_power2>(i,32);
+    test_inverse<boost::multiprecision::cpp_bin_float_50,test_complex_dft_power2>(i,1);
   }
   for(int i=1;i<=1000; i*=10)
   {
-    test_inverse<float,fftw_dft>(i);
-    test_inverse<double,fftw_dft>(i);
-    test_inverse<long double,fftw_dft>(i);
-    if(i <=10) {
+    test_inverse<float,fftw_dft>(i,1);
+    test_inverse<double,fftw_dft>(i,1);
+    test_inverse<long double,fftw_dft>(i,1);
 #ifdef BOOST_MATH_USE_FLOAT128
-      test_inverse<boost::multiprecision::float128,fftw_dft>(i);
+    test_inverse<boost::multiprecision::float128,fftw_dft>(i,1);
 #endif
-    }
     
-    test_inverse<double,gsl_dft>(i);
+    test_inverse<double,gsl_dft>(i,1);
     
-    test_inverse<float,bsl_dft>(i);
-    test_inverse<double,bsl_dft>(i);
-    test_inverse<long double,bsl_dft>(i);
-    if(i <=10) {
+    test_inverse<float,bsl_dft>(i,1);
+    test_inverse<double,bsl_dft>(i,1);
+    test_inverse<long double,bsl_dft>(i,1);
+    
+    if(i<=10) {
 #ifdef BOOST_MATH_USE_FLOAT128
-      test_inverse<boost::multiprecision::float128,bsl_dft>(i);
+      test_inverse<boost::multiprecision::float128,bsl_dft>(i,2);
 #endif
-      test_inverse<boost::multiprecision::cpp_bin_float_50,bsl_dft>(i);
+      test_inverse<boost::multiprecision::cpp_bin_float_50,bsl_dft>(i,2);
     }
   }
   for(auto i : std::vector<int>{2,3,5,7,11,13,17,23,29,31})
   {
-    test_inverse<float,fftw_dft>(i);
-    test_inverse<double,fftw_dft>(i);
-    test_inverse<long double,fftw_dft>(i);
+    test_inverse<float,fftw_dft>(i,1);
+    test_inverse<double,fftw_dft>(i,1);
+    test_inverse<long double,fftw_dft>(i,1);
 #ifdef BOOST_MATH_USE_FLOAT128
-    test_inverse<boost::multiprecision::float128,fftw_dft>(i);
+    test_inverse<boost::multiprecision::float128,fftw_dft>(i,1);
 #endif
     
-    test_inverse<double,gsl_dft>(i);
+    test_inverse<double,gsl_dft>(i,1);
     
-    test_inverse<float,bsl_dft>(i);
-    test_inverse<double,bsl_dft>(i);
-    test_inverse<long double,bsl_dft>(i);
+    test_inverse<float,bsl_dft>(i,2);
+    test_inverse<double,bsl_dft>(i,2);
+    test_inverse<long double,bsl_dft>(i,2);
 #ifdef BOOST_MATH_USE_FLOAT128
-    test_inverse<boost::multiprecision::float128,bsl_dft>(i);
+    test_inverse<boost::multiprecision::float128,bsl_dft>(i,2);
 #endif
-    test_inverse<boost::multiprecision::cpp_bin_float_50,bsl_dft>(i);
+    test_inverse<boost::multiprecision::cpp_bin_float_50,bsl_dft>(i,2);
     
-    test_inverse<float,test_dft_generic_prime_bruteForce>(i);
-    test_inverse<double,test_dft_generic_prime_bruteForce>(i);
-    test_inverse<long double,test_dft_generic_prime_bruteForce>(i);
+    test_inverse<float,test_dft_prime_bruteForce>(i,i*8);
+    test_inverse<double,test_dft_prime_bruteForce>(i,i*8);
+    test_inverse<long double,test_dft_prime_bruteForce>(i,i*8);
 #ifdef BOOST_MATH_USE_FLOAT128
-    test_inverse<boost::multiprecision::float128,test_dft_generic_prime_bruteForce>(i);
+    test_inverse<boost::multiprecision::float128,test_dft_prime_bruteForce>(i,i*8);
 #endif
-    test_inverse<boost::multiprecision::cpp_bin_float_50,test_dft_generic_prime_bruteForce>(i);
+    test_inverse<boost::multiprecision::cpp_bin_float_50,test_dft_prime_bruteForce>(i,i*8);
     
-    test_inverse<float,test_dft_complex_prime_bruteForce>(i);
-    test_inverse<double,test_dft_complex_prime_bruteForce>(i);
-    test_inverse<long double,test_dft_complex_prime_bruteForce>(i);
+    test_inverse<float,test_complex_dft_prime_bruteForce>(i,i*1);
+    test_inverse<double,test_complex_dft_prime_bruteForce>(i,i*1);
+    test_inverse<long double,test_complex_dft_prime_bruteForce>(i,i*1);
 #ifdef BOOST_MATH_USE_FLOAT128
-    test_inverse<boost::multiprecision::float128,test_dft_complex_prime_bruteForce>(i);
+    test_inverse<boost::multiprecision::float128,test_complex_dft_prime_bruteForce>(i,i*1);
 #endif
-    test_inverse<boost::multiprecision::cpp_bin_float_50,test_dft_complex_prime_bruteForce>(i);
+     test_inverse<boost::multiprecision::cpp_bin_float_50,test_complex_dft_prime_bruteForce>(i,i*1);
+    
+    if(i>2)
+    {
+      // TODO: does rader algorithm works for p=2?
+      test_inverse<float,test_complex_dft_prime_rader>(i,2);
+      test_inverse<double,test_complex_dft_prime_rader>(i,2);
+      test_inverse<long double,test_complex_dft_prime_rader>(i,2);
+#ifdef BOOST_MATH_USE_FLOAT128
+      test_inverse<boost::multiprecision::float128,test_complex_dft_prime_rader>(i,2);
+#endif
+      test_inverse<boost::multiprecision::cpp_bin_float_50,test_complex_dft_prime_rader>(i,2);
+    }
   }
-  // TODO: can we print a useful compilation error message for the following
-  // illegal case?
-  // dft<std::complex<int>> P(3);   
   
   for(int i=1;i<=100;++i)
   {
-    test_inverse<float,test_dft_generic_composite>(i);
-    test_inverse<double,test_dft_generic_composite>(i);
-    test_inverse<long double,test_dft_generic_composite>(i);
+    test_convolution<double,fftw_dft>(i,1024);
+    
+    test_inverse<float,test_dft_composite>(i,1024);
+    test_inverse<double,test_dft_composite>(i,1024);
+    test_inverse<long double,test_dft_composite>(i,1024);
+    
+    test_inverse<float,test_complex_dft_composite>(i,2);
+    test_inverse<double,test_complex_dft_composite>(i,2);
+    test_inverse<long double,test_complex_dft_composite>(i,2);
+  
+    test_inverse<float,test_dft_prime_bruteForce>(i,i*8);
+    test_inverse<double,test_dft_prime_bruteForce>(i,i*8);
+    test_inverse<long double,test_dft_prime_bruteForce>(i,i*8);
+    if(i<=10)
+    {
+#ifdef BOOST_MATH_USE_FLOAT128
+      test_inverse<boost::multiprecision::float128,test_dft_prime_bruteForce>(i,i*8);
+#endif
+      test_inverse<boost::multiprecision::cpp_bin_float_50,test_dft_prime_bruteForce>(i,i*8);
+    }
+    
+    test_inverse<float,test_complex_dft_prime_bruteForce>(i,i*1);
+    test_inverse<double,test_complex_dft_prime_bruteForce>(i,i*1);
+    test_inverse<long double,test_complex_dft_prime_bruteForce>(i,i*1);
+    if(i<=10)
+    {
+#ifdef BOOST_MATH_USE_FLOAT128
+      test_inverse<boost::multiprecision::float128,test_complex_dft_prime_bruteForce>(i,i*1);
+#endif
+      test_inverse<boost::multiprecision::cpp_bin_float_50,test_complex_dft_prime_bruteForce>(i,i*1);
+    }
+    
+    test_inverse<float,bsl_dft>(i,2);
+    test_inverse<double,bsl_dft>(i,2);
+    test_inverse<long double,bsl_dft>(i,2);
   }
+
+  // TODO: can we print a useful compilation error message for the following
+  // illegal case?
+  // dft<std::complex<int>> P(3);   
+  
   
   return boost::math::test::report_errors();
 }
diff --git a/test/fft_iterators.cpp b/test/fft_iterators.cpp
index 3a8aef8e4d..0514706499 100644
--- a/test/fft_iterators.cpp
+++ b/test/fft_iterators.cpp
@@ -1,5 +1,6 @@
 #include "math_unit_test.hpp"
 #include <boost/math/fft.hpp>
+#include <boost/math/fft/bsl_backend.hpp>
 #include <boost/math/fft/fftw_backend.hpp>
 #include <boost/math/fft/gsl_backend.hpp>
 #include <boost/math/constants/constants.hpp>
@@ -52,13 +53,12 @@ typename Container1::value_type::value_type difference(const Container1& A, cons
 }
 
 template<class RealType, template<class U> class Backend>
-void test_inverse(int N)
+void test_inverse(int N, int tolerance)
 {
   using ComplexType = std::complex<RealType>;
-  const RealType tol = 16*std::numeric_limits<RealType>::epsilon();
+  const RealType tol = tolerance*std::numeric_limits<RealType>::epsilon();
   const std::vector< ComplexType > A{random_vector<RealType>(N)};
-  
-  
+
   {
     std::vector<ComplexType> B(N), C(N);
     
@@ -71,39 +71,37 @@ void test_inverse(int N)
   {
     std::vector<ComplexType>  C(N);
     std::list<ComplexType> B;
-    
+
     dft_forward<Backend>(std::begin(A),std::end(A),std::back_inserter(B));
     dft_backward<Backend>(std::begin(B),std::end(B),std::begin(C));
-    
+
     RealType diff{difference(A,C)};
     CHECK_MOLLIFIED_CLOSE(RealType{0.0},diff,tol);
   }
   {
     std::list<ComplexType> C;
-    
+
     dft_forward<Backend>(std::begin(A),std::end(A),std::back_inserter(C));
     dft_backward<Backend>(std::begin(C),std::end(C),std::begin(C));
-    
+
     RealType diff{difference(A,C)};
     CHECK_MOLLIFIED_CLOSE(RealType{0.0},diff,tol);
   }
 }
 
-
 int main()
 {
   for(int i=1;i<=(1<<12); i*=2)
   {
-    test_inverse<float,fftw_dft>(i);
-    test_inverse<double,fftw_dft>(i);
-    test_inverse<long double,fftw_dft>(i);
+    test_inverse<float,fftw_dft>(i,1);
+    test_inverse<double,fftw_dft>(i,1);
+    test_inverse<long double,fftw_dft>(i,1);
     
-    test_inverse<double,gsl_dft>(i);
+    test_inverse<double,gsl_dft>(i,1);
     
-    test_inverse<float,bsl_dft>(i);
-    test_inverse<double,bsl_dft>(i);
-    test_inverse<long double,bsl_dft>(i);
+    test_inverse<float,bsl_dft>(i,1);
+    test_inverse<double,bsl_dft>(i,1);
+    test_inverse<long double,bsl_dft>(i,1);
   }
   return boost::math::test::report_errors();
 }
-
diff --git a/test/fft_non-complex.cpp b/test/fft_non-complex.cpp
new file mode 100644
index 0000000000..53128e129c
--- /dev/null
+++ b/test/fft_non-complex.cpp
@@ -0,0 +1,106 @@
+
+///////////////////////////////////////////////////////////////////
+//  Copyright Eduardo Quintana 2021
+//  Copyright Janek Kozicki 2021
+//  Copyright Christopher Kormanyos 2021
+//  Distributed under the Boost Software License,
+//  Version 1.0. (See accompanying file LICENSE_1_0.txt
+//  or copy at http://www.boost.org/LICENSE_1_0.txt)
+/*
+  boost::math::fft test for non-complex types
+  Use of DFT for Number Theoretical Transform.
+*/
+#include <boost/math/fft.hpp>
+#include <boost/math/fft/bsl_backend.hpp>
+
+namespace fft = boost::math::fft;
+
+#include <iostream>
+#include <vector>
+#include "math_unit_test.hpp"
+#include "fft_test_helpers.hpp"
+
+class Z337
+{
+public:
+  typedef int integer;
+  static constexpr integer mod{337};
+};
+
+void test_inverse()
+{
+  using M_int = fft::my_modulo_lib::mint<Z337>;
+  const M_int w{85};
+  const M_int inv_8{M_int{8}.inverse()};
+
+  std::vector<M_int> A{4, 3, 2, 1, 0, 0, 0, 0};
+  std::vector<M_int> FT_A,FT_FT_A;
+
+  fft::dft_forward<boost::math::fft::bsl_dft>(A.cbegin(),A.cend(),std::back_inserter(FT_A),w);
+
+  fft::dft_backward<boost::math::fft::bsl_dft>(FT_A.cbegin(),FT_A.cend(),std::back_inserter(FT_FT_A),w);
+
+  std::transform(FT_FT_A.begin(), FT_FT_A.end(), FT_FT_A.begin(),
+                 [&inv_8](M_int x) { return x * inv_8; });
+
+  int diff = 0;
+  for (size_t i = 0; i < A.size(); ++i)
+      diff += A[i] == FT_FT_A[i] ? 0 : 1;
+  CHECK_EQUAL(0,diff);
+}
+void test_convolution()
+/*
+  product of two integer by means of the NTT,
+  using the convolution theorem
+*/
+{
+  typedef fft::my_modulo_lib::field_modulo<int, 337> Z337;
+  using M_int = fft::my_modulo_lib::mint<Z337>;
+  const M_int w{85};
+  const M_int inv_8{M_int{8}.inverse()};
+
+  // Multiplying 1234 times 5678 = 7006652
+  std::vector<M_int> A{4, 3, 2, 1, 0, 0, 0, 0};
+  std::vector<M_int> B{8, 7, 6, 5, 0, 0, 0, 0};
+
+  // forward FFT
+  fft::dft_forward<boost::math::fft::bsl_dft>(A.cbegin(),A.cend(),A.begin(), w);
+  fft::dft_forward<boost::math::fft::bsl_dft>(B.cbegin(),B.cend(),B.begin(), w);
+
+  // convolution in Fourier space
+  std::vector<M_int> AB;
+  std::transform(A.begin(), A.end(), B.begin(),
+                 std::back_inserter(AB),
+                 [](M_int x, M_int y) { return x * y; });
+
+  // backwards FFT
+  fft::dft_backward<boost::math::fft::bsl_dft>(AB.cbegin(),AB.cend(),AB.begin(),w);
+  std::transform(AB.begin(), AB.end(), AB.begin(),
+                 [&inv_8](M_int x) { return x * inv_8; });
+
+  // carry the remainders in base 10
+  std::vector<int> C;
+  M_int r{0};
+  for (auto x : AB)
+  {
+    auto y = x + r;
+    C.emplace_back(int(y) % 10);
+    r = M_int(int(y) / 10);
+  }
+  // yields 7006652
+  CHECK_EQUAL(8,static_cast<int>(C.size()));
+  CHECK_EQUAL(2,C[0]);
+  CHECK_EQUAL(5,C[1]);
+  CHECK_EQUAL(6,C[2]);
+  CHECK_EQUAL(6,C[3]);
+  CHECK_EQUAL(0,C[4]);
+  CHECK_EQUAL(0,C[5]);
+  CHECK_EQUAL(7,C[6]);
+  CHECK_EQUAL(0,C[7]);
+}
+int main()
+{
+  test_inverse();
+  test_convolution();
+  return boost::math::test::report_errors();
+}
diff --git a/test/fft_test_helpers.hpp b/test/fft_test_helpers.hpp
index 3394a35c9c..1110e3a392 100644
--- a/test/fft_test_helpers.hpp
+++ b/test/fft_test_helpers.hpp
@@ -10,18 +10,49 @@
   #define BOOST_MATH_FFT_TEST_HELPERS_HPP
 
 #include <boost/math/fft/algorithms.hpp>
-  
+
 namespace boost { namespace math { namespace fft {
 
 template<class NativeComplexType>
-class test_dft_generic_prime_bruteForce
+class test_complex_dft_prime_rader
+{
+  /*
+    Special backend for testing the complex_dft_prime_rader implementation
+  */
+  using real_value_type = typename NativeComplexType::value_type;
+public:
+  constexpr test_complex_dft_prime_rader(std::size_t n)
+    : my_size{n}
+  { }
+  
+  void resize(std::size_t new_size)
+  {
+    my_size = new_size;
+  }
+  constexpr std::size_t size() const { return my_size; }
+
+  void forward(const NativeComplexType* in, NativeComplexType* out) const
+  {
+    detail::complex_dft_prime_rader(in,in+size(),out,1);
+  }
+
+  void backward(const NativeComplexType* in, NativeComplexType* out) const
+  {
+    detail::complex_dft_prime_rader(in,in+size(),out,-1);
+  }
+private:
+  std::size_t my_size;
+};
+
+template<class NativeComplexType>
+class test_dft_prime_bruteForce
 {
   /*
     Special backend for testing the dft_prime_bruteForce implementation
   */
   using real_value_type = typename NativeComplexType::value_type;
 public:
-  constexpr test_dft_generic_prime_bruteForce(std::size_t n)
+  constexpr test_dft_prime_bruteForce(std::size_t n)
     : my_size{n}
   { }
 
@@ -33,36 +64,29 @@ class test_dft_generic_prime_bruteForce
 
   void forward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    using std::cos;
-    using std::sin;
-    using boost::math::constants::pi;
-
-    NativeComplexType w{cos(2*pi<real_value_type>()/size()),-sin(2*pi<real_value_type>()/size())};
-    detail::dft_generic_prime_bruteForce(in,in+size(),out,w);
+    NativeComplexType w{detail::complex_root_of_unity<NativeComplexType>(size())};
+    detail::dft_prime_bruteForce(in,in+size(),out,w);
   }
 
   void backward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    using std::cos;
-    using std::sin;
-    using boost::math::constants::pi;
-
-    NativeComplexType w{cos(2*pi<real_value_type>()/size()),sin(2*pi<real_value_type>()/size())};
-    detail::dft_generic_prime_bruteForce(in,in+size(),out,w);
+    NativeComplexType w{detail::complex_inverse_root_of_unity<NativeComplexType>(size())};
+    detail::dft_prime_bruteForce(in,in+size(),out,w);
   }
 
 private:
   std::size_t my_size;
 };
+
 template<class NativeComplexType>
-class test_dft_generic_composite
+class test_complex_dft_prime_bruteForce
 {
   /*
-    Special backend for testing the dft_composite_dit
+    Special backend for testing the complex_dft_prime_bruteForce implementation
   */
   using real_value_type = typename NativeComplexType::value_type;
 public:
-  constexpr test_dft_generic_composite(std::size_t n)
+  constexpr test_complex_dft_prime_bruteForce(std::size_t n)
     : my_size{n}
   { }
 
@@ -74,22 +98,46 @@ class test_dft_generic_composite
 
   void forward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    using std::cos;
-    using std::sin;
-    using boost::math::constants::pi;
-
-    NativeComplexType w{cos(2*pi<real_value_type>()/size()),-sin(2*pi<real_value_type>()/size())};
-    detail::dft_composite_dit(in,in+size(),out,w);
+    detail::complex_dft_prime_bruteForce(in,in+size(),out,1);
   }
 
   void backward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    using std::cos;
-    using std::sin;
-    using boost::math::constants::pi;
+    detail::complex_dft_prime_bruteForce(in,in+size(),out,-1);
+  }
+
+private:
+  std::size_t my_size;
+};
+
+template<class NativeComplexType>
+class test_dft_composite
+{
+  /*
+    Special backend for testing the dft_composite
+  */
+  using real_value_type = typename NativeComplexType::value_type;
+public:
+  constexpr test_dft_composite(std::size_t n)
+    : my_size{n}
+  { }
+
+  void resize(std::size_t new_size)
+  {
+    my_size = new_size;
+  }
+  constexpr std::size_t size() const { return my_size; }
+
+  void forward(const NativeComplexType* in, NativeComplexType* out) const
+  {
+    NativeComplexType w{detail::complex_root_of_unity<NativeComplexType>(size())};
+    detail::dft_composite(in,in+size(),out,w);
+  }
 
-    NativeComplexType w{cos(2*pi<real_value_type>()/size()),sin(2*pi<real_value_type>()/size())};
-    detail::dft_composite_dit(in,in+size(),out,w);
+  void backward(const NativeComplexType* in, NativeComplexType* out) const
+  {
+    NativeComplexType w{detail::complex_inverse_root_of_unity<NativeComplexType>(size())};
+    detail::dft_composite(in,in+size(),out,w);
   }
 
 private:
@@ -97,14 +145,14 @@ class test_dft_generic_composite
 };
 
 template<class NativeComplexType>
-class test_dft_complex_prime_bruteForce
+class test_complex_dft_composite
 {
   /*
-    Special backend for testing the dft_prime_bruteForce implementation
+    Special backend for testing the complex_dft_composite
   */
   using real_value_type = typename NativeComplexType::value_type;
 public:
-  constexpr test_dft_complex_prime_bruteForce(std::size_t n)
+  constexpr test_complex_dft_composite(std::size_t n)
     : my_size{n}
   { }
 
@@ -116,27 +164,28 @@ class test_dft_complex_prime_bruteForce
 
   void forward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    detail::dft_complex_prime_bruteForce(in,in+size(),out,-1);
+    detail::complex_dft_composite(in,in+size(),out,1);
   }
 
   void backward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    detail::dft_complex_prime_bruteForce(in,in+size(),out,1);
+    detail::complex_dft_composite(in,in+size(),out,-1);
   }
 
 private:
   std::size_t my_size;
 };
+
   
 template<class NativeComplexType>
-class test_dft_power2_dit
+class test_dft_power2
 {
   /*
-    Special backend for testing the dft_power2_dit implementation
+    Special backend for testing the dft_power2 implementation
   */
   using real_value_type = typename NativeComplexType::value_type;
 public:
-  constexpr test_dft_power2_dit(std::size_t n)
+  constexpr test_dft_power2(std::size_t n)
     : my_size{n}
   { }
 
@@ -148,22 +197,14 @@ class test_dft_power2_dit
 
   void forward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    using std::cos;
-    using std::sin;
-    using boost::math::constants::pi;
-
-    NativeComplexType w{cos(2*pi<real_value_type>()/size()),-sin(2*pi<real_value_type>()/size())};
-    detail::dft_power2_dit(in,in+size(),out,w);
+    NativeComplexType w{detail::complex_inverse_root_of_unity<NativeComplexType>(size())};
+    detail::dft_power2(in,in+size(),out,w);
   }
 
   void backward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    using std::cos;
-    using std::sin;
-    using boost::math::constants::pi;
-
-    NativeComplexType w{cos(2*pi<real_value_type>()/size()),sin(2*pi<real_value_type>()/size())};
-    detail::dft_power2_dit(in,in+size(),out,w);
+    NativeComplexType w{detail::complex_root_of_unity<NativeComplexType>(size())};
+    detail::dft_power2(in,in+size(),out,w);
   }
 
 private:
@@ -171,14 +212,14 @@ class test_dft_power2_dit
 };
 
 template<class NativeComplexType>
-class test_dft_power2_dif
+class test_complex_dft_power2
 {
   /*
-    Special backend for testing the dft_power2_dif implementation
+    Special backend for testing the complex_dft_power2 implementation
   */
   using real_value_type = typename NativeComplexType::value_type;
 public:
-  constexpr test_dft_power2_dif(std::size_t n)
+  constexpr test_complex_dft_power2(std::size_t n)
     : my_size{n}
   { }
 
@@ -190,18 +231,111 @@ class test_dft_power2_dif
 
   void forward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    detail::dft_power2_dif(in,in+size(),out,-1);
+    detail::complex_dft_power2(in,in+size(),out,1);
   }
 
   void backward(const NativeComplexType* in, NativeComplexType* out) const
   {
-    detail::dft_power2_dif(in,in+size(),out,1);
+    detail::complex_dft_power2(in,in+size(),out,-1);
   }
 
 private:
   std::size_t my_size;
 };
 
+namespace my_modulo_lib
+{
+  template <typename T, T x>
+  class field_modulo
+  {
+   public:
+    typedef T integer;
+    static constexpr T mod = x;
+  };
+
+  /*
+      Modular Integers
+  */
+  template <typename Field>
+  class mint;
+
+  template <typename Field>
+  std::ostream& operator<<(std::ostream& os, const mint<Field>& A)
+  {
+    return os << A.x << " (mod " << Field::mod << ")";
+  }
+  template <typename Field>
+  class mint
+  {
+    typedef typename Field::integer integer;
+    integer x;
+
+   public:
+    constexpr mint() : x{0} {}
+
+    template <typename int_type>
+    mint(int_type _x) : x{_x}
+    {
+      x %= Field::mod;
+      if (x < 0)
+        x += Field::mod;
+    }
+
+    mint(const mint<Field>& that) : x{that.x} {}
+
+    mint<Field>& operator=(const mint<Field>& that)
+    {
+      return x = that.x, *this;
+    }
+
+    // explicit operator bool() const { return x == integer{0}; }
+    operator integer() const { return x; }
+
+    mint<Field>& operator+=(const mint<Field>& that)
+    {
+      return x = (x + that.x) % Field::mod, *this;
+    }
+
+    mint<Field>& operator*=(const mint<Field>& t)
+    {
+      // direct multiplication
+      x = (x * t.x) % Field::mod;
+      return *this;
+    }
+    bool operator==(const mint<Field>& that) const { return x == that.x; }
+    bool operator!=(const mint<Field>& that) const { return x != that.x; }
+
+    mint<Field> inverse() const { return ::boost::math::fft::detail::power(*this, Field::mod - 2); }
+
+    friend std::ostream& operator<<<Field>(std::ostream& os,
+                                           const mint<Field>& A);
+  };
+
+  template <typename T>
+  mint<T> operator+(const mint<T>& A, const mint<T>& B)
+  {
+    mint<T> C{A};
+    return C += B;
+  }
+  template <typename T>
+  mint<T> operator*(const mint<T>& A, const mint<T>& B)
+  {
+    mint<T> C{A};
+    return C *= B;
+  }
+  template <typename T>
+  mint<T>& operator/=(mint<T>& A, const mint<T>& B)
+  {
+    return A *= B.inverse();
+  }
+  template <typename T>
+  mint<T> operator/(const mint<T>& A, const mint<T>& B)
+  {
+    mint<T> C{A};
+    return C /= B;
+  }
+}  // namespace my_modulo_lib
+
 }}} // namespace boost::math::fft
   
 #endif // BOOST_MATH_FFT_TEST_HELPERS_HPP