From 614613e7d485fc29c0b4b2ed6f4e710e0735c1c2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 14:39:48 +0100 Subject: [PATCH 01/39] Add the dot routine to the abstract class --- Stream.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Stream.h b/Stream.h index 671289e4..d595122c 100644 --- a/Stream.h +++ b/Stream.h @@ -23,6 +23,7 @@ class Stream virtual void mul() = 0; virtual void add() = 0; virtual void triad() = 0; + virtual T dot() = 0; // Copy memory between host and device virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; From 0ef9b6691b5cb962bf73093c9482ad7562a3274d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 14:40:08 +0100 Subject: [PATCH 02/39] Implement the reduction in OpenACC --- ACCStream.cpp | 18 ++++++++++++++++++ ACCStream.h | 1 + 2 files changed, 19 insertions(+) diff --git a/ACCStream.cpp b/ACCStream.cpp index d3fbd6ad..c2c1ba59 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -112,6 +112,24 @@ void ACCStream::triad() a[i] = b[i] + scalar * c[i]; } } + +template +T ACCStream::dot() +{ + T sum = 0.0; + + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + #pragma acc kernels present(a[0:array_size], b[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + sum += a[i] * b[i]; + } + + return sum; +} + void listDevices(void) { // Get number of devices diff --git a/ACCStream.h b/ACCStream.h index 48fea551..09559e27 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -35,6 +35,7 @@ class ACCStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From 04ca3571593b555a310043664c43b9b546ae2371 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 14:40:28 +0100 Subject: [PATCH 03/39] Call the Dot kernel and print out results --- main.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/main.cpp b/main.cpp index 5379bb9a..51f7e480 100644 --- a/main.cpp +++ b/main.cpp @@ -133,7 +133,7 @@ void run() stream->write_arrays(a, b, c); // List of times - std::vector> timings(4); + std::vector> timings(5); // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -165,6 +165,12 @@ void run() t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Dot + t1 = std::chrono::high_resolution_clock::now(); + stream->dot(); + t2 = std::chrono::high_resolution_clock::now(); + timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); + } // Check solutions @@ -181,15 +187,16 @@ void run() std::cout << std::fixed; - std::string labels[4] = {"Copy", "Mul", "Add", "Triad"}; - size_t sizes[4] = { + std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; + size_t sizes[5] = { 2 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE + 3 * sizeof(T) * ARRAY_SIZE, + 2 * sizeof(T) * ARRAY_SIZE }; - for (int i = 0; i < 4; i++) + for (int i = 0; i < 5; i++) { // Get min/max; ignore the first result auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); From 275bfb2066741370fd06966407587d58eb09b658 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 14:45:28 +0100 
Subject: [PATCH 04/39] Check result of the final reduction --- main.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/main.cpp b/main.cpp index 51f7e480..d2fc43a4 100644 --- a/main.cpp +++ b/main.cpp @@ -44,7 +44,7 @@ bool use_float = false; template -void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum); template void run(); @@ -94,6 +94,9 @@ void run() << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; std::cout.precision(ss); + // Result of the Dot kernel + T sum; + Stream *stream; #if defined(CUDA) @@ -167,7 +170,7 @@ void run() // Execute Dot t1 = std::chrono::high_resolution_clock::now(); - stream->dot(); + sum = stream->dot(); t2 = std::chrono::high_resolution_clock::now(); timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); @@ -175,7 +178,7 @@ void run() // Check solutions stream->read_arrays(a, b, c); - check_solution(num_times, a, b, c); + check_solution(num_times, a, b, c, sum); // Display timing results std::cout @@ -220,12 +223,13 @@ void run() } template -void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum) { // Generate correct solution T goldA = 0.1; T goldB = 0.2; T goldC = 0.0; + T golSum = 0.0; const T scalar = 0.3; @@ -238,6 +242,9 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldA = goldB + scalar * goldC; } + // Do the reduction + goldSum = goldA * goldB * ntimes; + // Calculate the average error double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); errA /= a.size(); @@ -245,6 +252,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector errB /= b.size(); double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); errC /= c.size(); + double errSum = fabs(sum - goldSum); double epsi = std::numeric_limits::epsilon() * 100.0; @@ -260,6 +268,10 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector std::cerr << "Validation failed on c[]. Average error " << errC << std::endl; + if (errSum > epsi) + std::cerr + << "Validation failed on sum. 
Error " << errSum + << std::endl; } From 08fe695d518b9f1967f9a3d8cc7b33fda5400c04 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 15:04:04 +0100 Subject: [PATCH 05/39] Fix typo in main file --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index d2fc43a4..f1be420d 100644 --- a/main.cpp +++ b/main.cpp @@ -229,7 +229,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector T goldA = 0.1; T goldB = 0.2; T goldC = 0.0; - T golSum = 0.0; + T goldSum = 0.0; const T scalar = 0.3; From abe423ac6b4778c3adbf657f2e7dcedea2ff7d23 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 15:05:06 +0100 Subject: [PATCH 06/39] Implement dot kernel in OpenMP 3 --- OMP3Stream.cpp | 15 +++++++++++++++ OMP3Stream.h | 1 + 2 files changed, 16 insertions(+) diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index fe8323af..04d2deae 100644 --- a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -91,6 +91,21 @@ void OMP3Stream::triad() } } +template +T OMP3Stream::dot() +{ + T sum = 0.0; + + #pragma omp parallel for reduction(+:sum) + for (int i = 0; i < array_size; i++) + { + sum += a[i] * b[i]; + } + + return sum; +} + + void listDevices(void) { std::cout << "0: CPU" << std::endl; diff --git a/OMP3Stream.h b/OMP3Stream.h index 0f14300e..edad55ee 100644 --- a/OMP3Stream.h +++ b/OMP3Stream.h @@ -33,6 +33,7 @@ class OMP3Stream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From 8a100f07b446b91ba8b98b89b4b5ebc28eb9c30d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 15:19:25 +0100 Subject: [PATCH 07/39] Add dot kernel to OpenMP 4.5 - tested with clang-ykt --- OMP45Stream.cpp | 20 ++++++++++++++++++++ OMP45Stream.h | 1 + 2 files changed, 21 insertions(+) diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index f849c39d..b0562bc3 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -111,6 +111,26 @@ void OMP45Stream::triad() a[i] = b[i] + scalar * c[i]; } } + +template +T OMP45Stream::dot() +{ + T sum = 0.0; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + #pragma omp target teams distribute parallel for simd reduction(+:sum) map(tofrom: sum) + for (int i = 0; i < array_size; i++) + { + sum += a[i] * b[i]; + } + + return sum; +} + + + void listDevices(void) { // Get number of devices diff --git a/OMP45Stream.h b/OMP45Stream.h index bd812a1e..a1febb7a 100644 --- a/OMP45Stream.h +++ b/OMP45Stream.h @@ -36,6 +36,7 @@ class OMP45Stream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From 2085cacea0349bd51f0256f9be728345d9adc5a7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 17:07:55 +0100 Subject: [PATCH 08/39] Add an OpenCL dot kernel We have to name the kernel stream_dot (for example) because the "dot" kernel already exists. 
--- OCLStream.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ OCLStream.h | 9 +++++++++ 2 files changed, 50 insertions(+) diff --git a/OCLStream.cpp b/OCLStream.cpp index 0ed4b8e1..cef5fa60 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -50,6 +50,29 @@ std::string kernels{R"CLC( a[i] = b[i] + scalar * c[i]; } + kernel void stream_dot( + global const TYPE * restrict a, + global const TYPE * restrict b, + global TYPE * restrict sum, + local TYPE * restrict wg_sum) + { + const size_t i = get_global_id(0); + const size_t local_i = get_local_id(0); + wg_sum[local_i] = a[i] * b[i]; + + for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) + { + barrier(CLK_LOCAL_MEM_FENCE); + if (local_i < offset) + { + wg_sum[local_i] += wg_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[get_group_id(0)] = wg_sum[local_i]; + } + )CLC"}; @@ -99,6 +122,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + dot_kernel = new cl::KernelFunctor(program, "stream_dot"); array_size = ARRAY_SIZE; @@ -114,6 +138,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * WGSIZE); } @@ -166,6 +191,22 @@ void OCLStream::triad() queue.finish(); } +template +T OCLStream::dot() +{ + (*dot_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size), cl::NDRange(WGSIZE)), + d_a, d_b, d_sum, cl::Local(sizeof(T) * WGSIZE) + ); + cl::copy(queue, d_sum, sums.begin(), sums.end()); + + T sum = 0.0; + for (T val : sums) + sum += val; + + return sum; +} + template void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { diff --git a/OCLStream.h b/OCLStream.h index cb48da54..2f8193ab 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -20,17 +20,24 @@ #define IMPLEMENTATION_STRING "OpenCL" +// Local work-group size for dot kernel +#define WGSIZE 1024 + template class OCLStream : public Stream { protected: // Size of arrays unsigned int array_size; + + // Host array for partial sums for dot kernel + std::vector sums; // Device side pointers to arrays cl::Buffer d_a; cl::Buffer d_b; cl::Buffer d_c; + cl::Buffer d_sum; // OpenCL objects cl::Device device; @@ -41,6 +48,7 @@ class OCLStream : public Stream cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; cl::KernelFunctor *triad_kernel; + cl::KernelFunctor *dot_kernel; public: @@ -51,6 +59,7 @@ class OCLStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From d3b497a9ca359e68c68cc461f16021fd103c2799 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 17:51:40 +0100 Subject: [PATCH 09/39] Add a CUDA dot kernel --- CUDAStream.cu | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- CUDAStream.h | 8 ++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 802bb055..515540fc 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu 
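The CUDA kernel added below declares its dynamic shared memory as an aligned byte array and reinterpret_casts it to T. This is the usual workaround for templated kernels: writing extern __shared__ T smem[] directly would redeclare the same extern symbol with a different type in each instantiation, so the array is kept untyped and the allocation is sized by the third kernel launch parameter instead. A minimal sketch of the pattern with a hypothetical kernel (stage_through_smem is not part of the patch):

    // Illustrative only; the patch's dot_kernel uses the same declaration
    // trick for its per-block partial sums.
    template <typename T>
    __global__ void stage_through_smem(const T *in, T *out)
    {
      // Dynamic shared memory: untyped bytes, viewed as T after a cast.
      extern __shared__ __align__(sizeof(T)) unsigned char smem[];
      T *buf = reinterpret_cast<T *>(smem);

      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      buf[threadIdx.x] = in[i];
      __syncthreads();
      out[i] = buf[threadIdx.x];
    }

    // Launch with the shared allocation sized explicitly, e.g.:
    //   stage_through_smem<<<blocks, TBSIZE, sizeof(double) * TBSIZE>>>(d_in, d_out);
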
@@ -8,8 +8,6 @@ #include "CUDAStream.h" -#define TBSIZE 1024 - void check_error(void) { cudaError_t err = cudaGetLastError(); @@ -47,6 +45,9 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; + // Allocate the host array for partial sums for dot kernels + sums = (T*)malloc(sizeof(T) * TBSIZE); + // Check buffers fit on the device cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); @@ -60,12 +61,16 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) check_error(); cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); + cudaMalloc(&d_sum, TBSIZE*sizeof(T)); + check_error(); } template CUDAStream::~CUDAStream() { + free(sums); + cudaFree(d_a); check_error(); cudaFree(d_b); @@ -165,6 +170,46 @@ void CUDAStream::triad() check_error(); } +template +__global__ void dot_kernel(const T * a, const T * b, T * sum) +{ + + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T *tb_sum = reinterpret_cast(smem); + + const int i = blockDim.x * blockIdx.x + threadIdx.x; + const size_t local_i = threadIdx.x; + + tb_sum[local_i] = a[i] * b[i]; + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) + { + __syncthreads(); + if (local_i < offset) + { + tb_sum[local_i] += tb_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; +} + +template +T CUDAStream::dot() +{ + dot_kernel<<>>(d_a, d_b, d_sum); + check_error(); + + cudaMemcpy(sums, d_sum, TBSIZE*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + + T sum = 0.0; + for (int i = 0; i < TBSIZE; i++) + sum += sums[i]; + + return sum; +} void listDevices(void) { diff --git a/CUDAStream.h b/CUDAStream.h index 6904a866..09a72b07 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -15,16 +15,23 @@ #define IMPLEMENTATION_STRING "CUDA" +#define TBSIZE 1024 + template class CUDAStream : public Stream { protected: // Size of arrays unsigned int array_size; + + // Host array for partial sums for dot kernel + T *sums; + // Device side pointers to arrays T *d_a; T *d_b; T *d_c; + T *d_sum; public: @@ -36,6 +43,7 @@ class CUDAStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From 823e12708fe93cd459c2da1683a7b8accb9ef4ec Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 21 Oct 2016 10:58:26 +0100 Subject: [PATCH 10/39] Add dot kernel to Kokkos --- KOKKOSStream.cpp | 17 +++++++++++++++++ KOKKOSStream.hpp | 1 + 2 files changed, 18 insertions(+) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index d73f7d5a..58dbe944 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -121,6 +121,23 @@ void KOKKOSStream::triad() Kokkos::fence(); } +template +T KOKKOSStream::dot() +{ + View a(*d_a); + View b(*d_b); + + T sum = 0.0; + + parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp) + { + tmp += a[index] * b[index]; + }, sum); + + return sum; + +} + void listDevices(void) { std::cout << "This is not the device you are looking for."; diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp index d2b9665a..013deffd 100644 --- a/KOKKOSStream.hpp +++ b/KOKKOSStream.hpp @@ -47,6 +47,7 @@ class KOKKOSStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays( 
const std::vector& a, const std::vector& b, const std::vector& c) override; From 7408ab0366eebce6ee917743a17dacde36f3c507 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 11:34:40 +0100 Subject: [PATCH 11/39] Add RAJA dot kernel --- RAJAStream.cpp | 17 +++++++++++++++++ RAJAStream.hpp | 3 +++ 2 files changed, 20 insertions(+) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index e418f09f..21c18437 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -109,6 +109,23 @@ void RAJAStream::triad() }); } +template +T RAJAStream::dot() +{ + T* a = d_a; + T* b = d_b; + + RAJA::ReduceSum sum(0.0); + + forall(index_set, [=] RAJA_DEVICE (int index) + { + sum += a[index] * b[index]; + }); + + return T(sum); +} + + void listDevices(void) { std::cout << "This is not the device you are looking for."; diff --git a/RAJAStream.hpp b/RAJAStream.hpp index 454e20e7..768314a4 100644 --- a/RAJAStream.hpp +++ b/RAJAStream.hpp @@ -18,11 +18,13 @@ typedef RAJA::IndexSet::ExecPolicy< RAJA::seq_segit, RAJA::omp_parallel_for_exec> policy; +typedef RAJA::omp_reduce reduce_policy; #else const size_t block_size = 128; typedef RAJA::IndexSet::ExecPolicy< RAJA::seq_segit, RAJA::cuda_exec> policy; +typedef RAJA::cuda_reduce reduce_policy; #endif template @@ -49,6 +51,7 @@ class RAJAStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays( const std::vector& a, const std::vector& b, const std::vector& c) override; From 1e948708597df5b9dff63f748b5a23c9bb5aaffd Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 24 Oct 2016 12:47:01 +0100 Subject: [PATCH 12/39] Fix verification of dot kernel --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index fb689cfe..871f06b3 100644 --- a/main.cpp +++ b/main.cpp @@ -249,7 +249,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector } // Do the reduction - goldSum = goldA * goldB * ntimes; + goldSum = goldA * goldB * ARRAY_SIZE; // Calculate the average error double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); From 8a8f44b4ce4bf9ff1cba08787d25aa2e1a1182f1 Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 24 Oct 2016 12:47:25 +0100 Subject: [PATCH 13/39] Fix CUDA host code for dot kernel Wrong number of blocks was being copied and summed. 
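The arithmetic behind the fix: the dot kernel launches array_size/TBSIZE thread blocks and each block writes exactly one partial sum, so the device buffer, the memcpy, and the final host loop must all cover array_size/TBSIZE elements, whereas the original host code used TBSIZE for all three. A quick check with illustrative numbers (the benchmark's default array size is assumed here):

    #include <cstdio>

    int main()
    {
      const unsigned int array_size = 33554432; // 2^25, the default size (assumed)
      const unsigned int tbsize = 1024;         // TBSIZE from CUDAStream.h
      const unsigned int num_blocks = array_size / tbsize;

      // 32768 partial sums exist; copying and summing only TBSIZE (1024)
      // of them silently dropped most of the reduction.
      std::printf("blocks = %u partial sums\n", num_blocks);
      return 0;
    }
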
--- CUDAStream.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 515540fc..8a74fcb3 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -46,7 +46,7 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * TBSIZE); + sums = (T*)malloc(sizeof(T) * (ARRAY_SIZE/TBSIZE)); // Check buffers fit on the device cudaDeviceProp props; @@ -61,7 +61,7 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) check_error(); cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - cudaMalloc(&d_sum, TBSIZE*sizeof(T)); + cudaMalloc(&d_sum, (ARRAY_SIZE/TBSIZE)*sizeof(T)); check_error(); } @@ -201,11 +201,11 @@ T CUDAStream::dot() dot_kernel<<>>(d_a, d_b, d_sum); check_error(); - cudaMemcpy(sums, d_sum, TBSIZE*sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(sums, d_sum, (array_size/TBSIZE)*sizeof(T), cudaMemcpyDeviceToHost); check_error(); T sum = 0.0; - for (int i = 0; i < TBSIZE; i++) + for (int i = 0; i < (array_size/TBSIZE); i++) sum += sums[i]; return sum; From c9b3d07b84dc100af721e26ac0ff6127765ab1ad Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 24 Oct 2016 12:49:58 +0100 Subject: [PATCH 14/39] Fix OpenCL host code for dot kernel Wrong number of blocks was being copied and summed, and the host sums vector didn't have the correct size. --- OCLStream.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/OCLStream.cpp b/OCLStream.cpp index cef5fa60..3e97f7d2 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -138,8 +138,9 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * WGSIZE); + d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * (ARRAY_SIZE/WGSIZE)); + sums = std::vector(ARRAY_SIZE/WGSIZE); } template From cfc1aba2c0b40b85b1dcd992118835a25ba50982 Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 24 Oct 2016 12:51:01 +0100 Subject: [PATCH 15/39] Use WGSIZE=256 for dot for compatability with AMD --- OCLStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OCLStream.h b/OCLStream.h index 2f8193ab..9cda9857 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -21,7 +21,7 @@ #define IMPLEMENTATION_STRING "OpenCL" // Local work-group size for dot kernel -#define WGSIZE 1024 +#define WGSIZE 256 template class OCLStream : public Stream From 644ebc40efa699e770a714fb2a2f00e829fa2802 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 16:22:35 +0100 Subject: [PATCH 16/39] Verify reduction result to 8 decimal places --- main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 7e0dd22c..f717d888 100644 --- a/main.cpp +++ b/main.cpp @@ -273,9 +273,12 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector std::cerr << "Validation failed on c[]. Average error " << errC << std::endl; - if (errSum > epsi) + // Check sum to 8 decimal places + if (errSum > 1.0E-8) std::cerr << "Validation failed on sum. 
Error " << errSum + << std::endl << std::setprecision(15) + << "Sum was " << sum << " but should be " << goldSum << std::endl; } From d5482b74f4c0175786ba4103ff86a10ef8e526db Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 24 Oct 2016 21:26:09 +0100 Subject: [PATCH 17/39] Improve performance of OpenCL dot implementation --- OCLStream.cpp | 19 +++++++++++-------- OCLStream.h | 9 +++++---- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/OCLStream.cpp b/OCLStream.cpp index a1f3f194..928421ff 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -54,11 +54,14 @@ std::string kernels{R"CLC( global const TYPE * restrict a, global const TYPE * restrict b, global TYPE * restrict sum, - local TYPE * restrict wg_sum) + local TYPE * restrict wg_sum, + int array_size) { - const size_t i = get_global_id(0); + size_t i = get_global_id(0); const size_t local_i = get_local_id(0); - wg_sum[local_i] = a[i] * b[i]; + wg_sum[local_i] = 0.0; + for (; i < array_size; i += get_global_size(0)) + wg_sum[local_i] += a[i] * b[i]; for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) { @@ -128,7 +131,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); - dot_kernel = new cl::KernelFunctor(program, "stream_dot"); + dot_kernel = new cl::KernelFunctor(program, "stream_dot"); array_size = ARRAY_SIZE; @@ -144,9 +147,9 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * (ARRAY_SIZE/WGSIZE)); + d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * DOT_NUM_GROUPS); - sums = std::vector(ARRAY_SIZE/WGSIZE); + sums = std::vector(DOT_NUM_GROUPS); } template @@ -202,8 +205,8 @@ template T OCLStream::dot() { (*dot_kernel)( - cl::EnqueueArgs(queue, cl::NDRange(array_size), cl::NDRange(WGSIZE)), - d_a, d_b, d_sum, cl::Local(sizeof(T) * WGSIZE) + cl::EnqueueArgs(queue, cl::NDRange(DOT_NUM_GROUPS*DOT_WGSIZE), cl::NDRange(DOT_WGSIZE)), + d_a, d_b, d_sum, cl::Local(sizeof(T) * DOT_WGSIZE), array_size ); cl::copy(queue, d_sum, sums.begin(), sums.end()); diff --git a/OCLStream.h b/OCLStream.h index 97e3f93d..20e5049e 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -21,8 +21,9 @@ #define IMPLEMENTATION_STRING "OpenCL" -// Local work-group size for dot kernel -#define WGSIZE 256 +// NDRange configuration for the dot kernel +#define DOT_WGSIZE 256 +#define DOT_NUM_GROUPS 256 template class OCLStream : public Stream @@ -30,7 +31,7 @@ class OCLStream : public Stream protected: // Size of arrays unsigned int array_size; - + // Host array for partial sums for dot kernel std::vector sums; @@ -49,7 +50,7 @@ class OCLStream : public Stream cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; cl::KernelFunctor *triad_kernel; - cl::KernelFunctor *dot_kernel; + cl::KernelFunctor *dot_kernel; public: From dfc79eeb4dafbc16e2253892e20bc2c36812a91b Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 24 Oct 2016 21:42:39 +0100 Subject: [PATCH 18/39] Improve performance of CUDA dot implementation --- CUDAStream.cu | 18 ++++++++++-------- CUDAStream.h | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/CUDAStream.cu 
b/CUDAStream.cu index 0f809bac..5d5a5109 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -46,7 +46,7 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * (ARRAY_SIZE/TBSIZE)); + sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); // Check buffers fit on the device cudaDeviceProp props; @@ -61,7 +61,7 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) check_error(); cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - cudaMalloc(&d_sum, (ARRAY_SIZE/TBSIZE)*sizeof(T)); + cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); check_error(); } @@ -171,16 +171,18 @@ void CUDAStream::triad() } template -__global__ void dot_kernel(const T * a, const T * b, T * sum) +__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size) { extern __shared__ __align__(sizeof(T)) unsigned char smem[]; T *tb_sum = reinterpret_cast(smem); - const int i = blockDim.x * blockIdx.x + threadIdx.x; + int i = blockDim.x * blockIdx.x + threadIdx.x; const size_t local_i = threadIdx.x; - tb_sum[local_i] = a[i] * b[i]; + tb_sum[local_i] = 0.0; + for (; i < array_size; i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { @@ -198,14 +200,14 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum) template T CUDAStream::dot() { - dot_kernel<<>>(d_a, d_b, d_sum); + dot_kernel<<>>(d_a, d_b, d_sum, array_size); check_error(); - cudaMemcpy(sums, d_sum, (array_size/TBSIZE)*sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost); check_error(); T sum = 0.0; - for (int i = 0; i < (array_size/TBSIZE); i++) + for (int i = 0; i < DOT_NUM_BLOCKS; i++) sum += sums[i]; return sum; diff --git a/CUDAStream.h b/CUDAStream.h index 09a72b07..8fcd6e5e 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -16,6 +16,7 @@ #define IMPLEMENTATION_STRING "CUDA" #define TBSIZE 1024 +#define DOT_NUM_BLOCKS 256 template class CUDAStream : public Stream From e5b67ac969ef05206579db19e262ab0ec5f93103 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 25 Oct 2016 12:22:01 +0100 Subject: [PATCH 19/39] Version bump --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d012749..fd80697f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ include(CheckIncludeFileCXX) include(CheckCXXCompilerFlag) set(gpu-stream_VERSION_MAJOR 2) -set(gpu-stream_VERSION_MINOR 1) +set(gpu-stream_VERSION_MINOR 2) configure_file(common.h.in common.h) include_directories(${CMAKE_BINARY_DIR}) From ed630e7dbc358e84f2c62f4768a9c9a2708eee95 Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 25 Oct 2016 16:39:23 +0100 Subject: [PATCH 20/39] [SYCL] Implement dot kernel --- SYCLStream.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ SYCLStream.h | 2 ++ 2 files changed, 50 insertions(+) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 215f161d..e5fd9c67 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -13,6 +13,9 @@ using namespace cl::sycl; #define WGSIZE 256 +#define DOT_WGSIZE 256 +#define DOT_NUM_GROUPS 256 + // Cache list of devices bool cached = false; std::vector devices; @@ -48,6 +51,7 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = new buffer(array_size); d_b = new buffer(array_size); d_c = new buffer(array_size); + d_sum = new 
buffer(DOT_NUM_GROUPS); } template @@ -56,6 +60,7 @@ SYCLStream::~SYCLStream() delete d_a; delete d_b; delete d_c; + delete d_sum; delete queue; } @@ -124,6 +129,49 @@ void SYCLStream::triad() queue->wait(); } +template +T SYCLStream::dot() +{ + queue->submit([&](handler &cgh) + { + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto ksum = d_sum->template get_access(cgh); + + auto wg_sum = accessor(range<1>(DOT_WGSIZE), cgh); + + size_t N = array_size; + + cgh.parallel_for(nd_range<1>(DOT_NUM_GROUPS*DOT_WGSIZE, DOT_WGSIZE), [=](nd_item<1> item) + { + size_t i = item.get_global(0); + size_t li = item.get_local(0); + wg_sum[li] = 0.0; + for (; i < N; i += item.get_global_range()[0]) + wg_sum[li] += ka[i] * kb[i]; + + for (int offset = item.get_local_range()[0]; offset > 0; offset /= 2) + { + item.barrier(cl::sycl::access::fence_space::local_space); + if (li < offset) + wg_sum[li] += wg_sum[li + offset]; + } + + if (li == 0) + ksum[item.get_group(0)] = wg_sum[0]; + }); + }); + + T sum = 0.0; + auto h_sum = d_sum->template get_access(); + for (int i = 0; i < DOT_NUM_GROUPS; i++) + { + sum += h_sum[i]; + } + + return sum; +} + template void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { diff --git a/SYCLStream.h b/SYCLStream.h index 8bc515db..ce3225ef 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -27,6 +27,7 @@ class SYCLStream : public Stream cl::sycl::buffer *d_a; cl::sycl::buffer *d_b; cl::sycl::buffer *d_c; + cl::sycl::buffer *d_sum; public: @@ -37,6 +38,7 @@ class SYCLStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From 21556af50070722f6c2fc255588c2b0afb99f7ba Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 26 Oct 2016 15:18:13 +0100 Subject: [PATCH 21/39] [OCL] Automatically determine dot NDRange config --- OCLStream.cpp | 21 +++++++++++++++++---- OCLStream.h | 8 ++++---- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/OCLStream.cpp b/OCLStream.cpp index 928421ff..199eff50 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -90,9 +90,22 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device index"); device = devices[device_index]; + // Determine sensible dot kernel NDRange configuration + if (device.getInfo() & CL_DEVICE_TYPE_CPU) + { + dot_num_groups = device.getInfo(); + dot_wgsize = device.getInfo() * 2; + } + else + { + dot_num_groups = device.getInfo() * 4; + dot_wgsize = device.getInfo(); + } + // Print out device information std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + std::cout << "Dot kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; context = cl::Context(device); queue = cl::CommandQueue(context); @@ -147,9 +160,9 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * DOT_NUM_GROUPS); + d_sum = 
cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups); - sums = std::vector(DOT_NUM_GROUPS); + sums = std::vector(dot_num_groups); } template @@ -205,8 +218,8 @@ template T OCLStream::dot() { (*dot_kernel)( - cl::EnqueueArgs(queue, cl::NDRange(DOT_NUM_GROUPS*DOT_WGSIZE), cl::NDRange(DOT_WGSIZE)), - d_a, d_b, d_sum, cl::Local(sizeof(T) * DOT_WGSIZE), array_size + cl::EnqueueArgs(queue, cl::NDRange(dot_num_groups*dot_wgsize), cl::NDRange(dot_wgsize)), + d_a, d_b, d_sum, cl::Local(sizeof(T) * dot_wgsize), array_size ); cl::copy(queue, d_sum, sums.begin(), sums.end()); diff --git a/OCLStream.h b/OCLStream.h index 20e5049e..ab10a7bc 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -21,10 +21,6 @@ #define IMPLEMENTATION_STRING "OpenCL" -// NDRange configuration for the dot kernel -#define DOT_WGSIZE 256 -#define DOT_NUM_GROUPS 256 - template class OCLStream : public Stream { @@ -52,6 +48,10 @@ class OCLStream : public Stream cl::KernelFunctor *triad_kernel; cl::KernelFunctor *dot_kernel; + // NDRange configuration for the dot kernel + size_t dot_num_groups; + size_t dot_wgsize; + public: OCLStream(const unsigned int, const int); From cbf97dc7d989ba88a55126dc735df4258665b9b0 Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 26 Oct 2016 15:18:31 +0100 Subject: [PATCH 22/39] [SYCL] Automatically determine dot NDRange config --- SYCLStream.cpp | 24 +++++++++++++++++------- SYCLStream.h | 4 ++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index e5fd9c67..e78651bb 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -13,9 +13,6 @@ using namespace cl::sycl; #define WGSIZE 256 -#define DOT_WGSIZE 256 -#define DOT_NUM_GROUPS 256 - // Cache list of devices bool cached = false; std::vector devices; @@ -41,9 +38,22 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; + // Determine sensible dot kernel NDRange configuration + if (dev.is_cpu()) + { + dot_num_groups = dev.get_info(); + dot_wgsize = dev.get_info() * 2; + } + else + { + dot_num_groups = dev.get_info() * 4; + dot_wgsize = dev.get_info(); + } + // Print out device information std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + std::cout << "Dot kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; queue = new cl::sycl::queue(dev); @@ -51,7 +61,7 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = new buffer(array_size); d_b = new buffer(array_size); d_c = new buffer(array_size); - d_sum = new buffer(DOT_NUM_GROUPS); + d_sum = new buffer(dot_num_groups); } template @@ -138,11 +148,11 @@ T SYCLStream::dot() auto kb = d_b->template get_access(cgh); auto ksum = d_sum->template get_access(cgh); - auto wg_sum = accessor(range<1>(DOT_WGSIZE), cgh); + auto wg_sum = accessor(range<1>(dot_wgsize), cgh); size_t N = array_size; - cgh.parallel_for(nd_range<1>(DOT_NUM_GROUPS*DOT_WGSIZE, DOT_WGSIZE), [=](nd_item<1> item) + cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) { size_t i = item.get_global(0); size_t li = item.get_local(0); @@ -164,7 +174,7 @@ T SYCLStream::dot() T sum = 0.0; auto h_sum = d_sum->template get_access(); - for (int i = 0; i < DOT_NUM_GROUPS; i++) + for (int i = 0; i < dot_num_groups; i++) { sum += h_sum[i]; } diff --git a/SYCLStream.h b/SYCLStream.h index 
ce3225ef..6f7205b2 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -29,6 +29,10 @@ class SYCLStream : public Stream cl::sycl::buffer *d_c; cl::sycl::buffer *d_sum; + // NDRange configuration for the dot kernel + size_t dot_num_groups; + size_t dot_wgsize; + public: SYCLStream(const unsigned int, const int); From d7c48c50630a554a93117f018cf80aea542c2318 Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 26 Oct 2016 15:47:10 +0100 Subject: [PATCH 23/39] Slight tweak to dot config output to fix parsing scripts --- OCLStream.cpp | 2 +- SYCLStream.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/OCLStream.cpp b/OCLStream.cpp index 199eff50..8e2bf5e1 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -105,7 +105,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) // Print out device information std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - std::cout << "Dot kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; + std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; context = cl::Context(device); queue = cl::CommandQueue(context); diff --git a/SYCLStream.cpp b/SYCLStream.cpp index e78651bb..29a065cf 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -53,7 +53,7 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) // Print out device information std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - std::cout << "Dot kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; + std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; queue = new cl::sycl::queue(dev); From 98962c4aeec31774f355254ec8e80d787b4ba3f4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 27 Oct 2016 15:19:16 +0100 Subject: [PATCH 24/39] Add Kokkos CPU Makefile --- KokkosCPUMakefile | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 KokkosCPUMakefile diff --git a/KokkosCPUMakefile b/KokkosCPUMakefile new file mode 100644 index 00000000..9dd45f4c --- /dev/null +++ b/KokkosCPUMakefile @@ -0,0 +1,10 @@ + +default: gpu-stream-kokkos + +include $(KOKKOS_PATH)/Makefile.kokkos + +gpu-stream-kokkos: main.o KOKKOSStream.o + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU From 4e395729667e1c1e0fef5482edd5761913f843cd Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 27 Oct 2016 15:41:23 +0100 Subject: [PATCH 25/39] Add O3 to Kokkos CPU --- KokkosCPUMakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/KokkosCPUMakefile b/KokkosCPUMakefile index 9dd45f4c..caa8b77d 100644 --- a/KokkosCPUMakefile +++ b/KokkosCPUMakefile @@ -4,7 +4,7 @@ default: gpu-stream-kokkos include $(KOKKOS_PATH)/Makefile.kokkos gpu-stream-kokkos: main.o KOKKOSStream.o - $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3 %.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3 From 
dd296d2231c94e0d8ae9ba6c7a52a84c523d94c9 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 28 Oct 2016 21:15:12 +0100 Subject: [PATCH 26/39] [SYCL] Prebuild dot kernel like the others --- SYCLStream.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index e56f530f..60a79a56 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -24,6 +24,7 @@ namespace kernels { class mul; class add; class triad; + class dot; } template @@ -63,7 +64,7 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) p->build_from_kernel_name(); p->build_from_kernel_name(); p->build_from_kernel_name(); - + p->build_from_kernel_name(); // Create buffers d_a = new buffer(array_size); @@ -169,7 +170,7 @@ T SYCLStream::dot() size_t N = array_size; - cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) { size_t i = item.get_global(0); size_t li = item.get_local(0); From 7f4761ae52546851dcecd8283794f2ce055b8335 Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 2 Nov 2016 11:22:01 +0000 Subject: [PATCH 27/39] Replace write_arrays with init_arrays This allows each model to initialise their arrays with a parallel approach, which yields the first touch required for good performance on NUMA architectures. --- ACCStream.cpp | 18 ++++++++++++------ ACCStream.h | 2 +- CUDAStream.cu | 19 +++++++++++++------ CUDAStream.h | 2 +- HIPStream.cu | 18 ++++++++++++------ HIPStream.h | 2 +- KOKKOSStream.cpp | 20 ++++++++++---------- KOKKOSStream.hpp | 3 +-- OCLStream.cpp | 25 ++++++++++++++++++++----- OCLStream.h | 3 ++- OMP3Stream.cpp | 8 ++++---- OMP3Stream.h | 2 +- OMP45Stream.cpp | 12 +++++++++--- OMP45Stream.h | 2 +- RAJAStream.cpp | 21 ++++++++++----------- RAJAStream.hpp | 3 +-- SYCLStream.cpp | 26 +++++++++++++++++--------- SYCLStream.h | 2 +- Stream.h | 2 +- main.cpp | 8 ++++---- 20 files changed, 122 insertions(+), 76 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index bd496631..ccc942ab 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -36,13 +36,19 @@ ACCStream::~ACCStream() } template -void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void ACCStream::init_arrays(T initA, T initB, T initC) { - T *a = this->a; - T *b = this->b; - T *c = this->c; - #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) - {} + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } } template diff --git a/ACCStream.h b/ACCStream.h index 48fea551..54f947ba 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -36,7 +36,7 @@ class ACCStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; diff --git a/CUDAStream.cu b/CUDAStream.cu index ff2ec413..08026d93 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -74,15 +74,22 @@ CUDAStream::~CUDAStream() check_error(); } + +template +__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T 
initC) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = initA; + b[i] = initB; + c[i] = initC; +} + template -void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void CUDAStream::init_arrays(T initA, T initB, T initC) { - // Copy host memory to device - cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); - cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); - check_error(); - cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); check_error(); } diff --git a/CUDAStream.h b/CUDAStream.h index 6904a866..912721e2 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -37,7 +37,7 @@ class CUDAStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/HIPStream.cu b/HIPStream.cu index 34ecfb6c..8c02348a 100644 --- a/HIPStream.cu +++ b/HIPStream.cu @@ -74,15 +74,21 @@ HIPStream::~HIPStream() check_error(); } +template +__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] = initA; + b[i] = initB; + c[i] = initC; +} + template -void HIPStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void HIPStream::init_arrays(T initA, T initB, T initC) { - // Copy host memory to device - hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice); - check_error(); - hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice); + hipLaunchKernel(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); check_error(); - hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice); + hipDeviceSynchronize(); check_error(); } diff --git a/HIPStream.h b/HIPStream.h index 9015e354..392080ad 100644 --- a/HIPStream.h +++ b/HIPStream.h @@ -37,7 +37,7 @@ class HIPStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 94ac7ee2..72b1ee52 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -34,18 +34,18 @@ KOKKOSStream::~KOKKOSStream() } template -void KOKKOSStream::write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) +void KOKKOSStream::init_arrays(T initA, T initB, T initC) { - for(int ii = 0; ii < array_size; ++ii) + View a(*d_a); + View b(*d_b); + View c(*d_c); + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - (*hm_a)(ii) = a[ii]; - (*hm_b)(ii) = b[ii]; - (*hm_c)(ii) = c[ii]; - } - deep_copy(*d_a, *hm_a); - deep_copy(*d_b, *hm_b); - deep_copy(*d_c, *hm_c); + a[index] = initA; + b[index] - initB; + c[index] = initC; + }); + Kokkos::fence(); } template diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp index d2b9665a..ff7cfeb9 100644 --- a/KOKKOSStream.hpp +++ b/KOKKOSStream.hpp @@ -48,8 +48,7 @@ class KOKKOSStream : public Stream 
virtual void mul() override; virtual void triad() override; - virtual void write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays( std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OCLStream.cpp b/OCLStream.cpp index 2a1e5ee1..c7e09a8c 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -16,6 +16,18 @@ std::string kernels{R"CLC( constant TYPE scalar = startScalar; + kernel void init( + global TYPE * restrict a, + global TYPE * restrict b, + global TYPE * restrict c, + TYPE initA, TYPE initB, TYPE initC) + { + const size_t i = get_global_id(0); + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + kernel void copy( global const TYPE * restrict a, global TYPE * restrict c) @@ -101,6 +113,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) } // Create kernels + init_kernel = new cl::KernelFunctor(program, "init"); copy_kernel = new cl::KernelFunctor(program, "copy"); mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); @@ -120,12 +133,12 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - } template OCLStream::~OCLStream() { + delete init_kernel; delete copy_kernel; delete mul_kernel; delete add_kernel; @@ -173,11 +186,13 @@ void OCLStream::triad() } template -void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void OCLStream::init_arrays(T initA, T initB, T initC) { - cl::copy(queue, a.begin(), a.end(), d_a); - cl::copy(queue, b.begin(), b.end(), d_b); - cl::copy(queue, c.begin(), c.end(), d_c); + (*init_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c, initA, initB, initC + ); + queue.finish(); } template diff --git a/OCLStream.h b/OCLStream.h index 54abaa39..845e144a 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -38,6 +38,7 @@ class OCLStream : public Stream cl::Context context; cl::CommandQueue queue; + cl::KernelFunctor *init_kernel; cl::KernelFunctor *copy_kernel; cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; @@ -53,7 +54,7 @@ class OCLStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index f578c7c6..b5e1bc22 100644 --- a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -26,14 +26,14 @@ OMP3Stream::~OMP3Stream() template -void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void OMP3Stream::init_arrays(T initA, T initB, T initC) { #pragma omp parallel for for (int i = 0; i < array_size; i++) { - a[i] = h_a[i]; - b[i] = h_b[i]; - c[i] = h_c[i]; + a[i] = initA; + b[i] = initB; + c[i] = initC; } } diff --git a/OMP3Stream.h b/OMP3Stream.h index 0f14300e..1dadc952 100644 --- a/OMP3Stream.h +++ b/OMP3Stream.h @@ -34,7 +34,7 @@ class OMP3Stream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const 
std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index 8f684e29..17226623 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -35,13 +35,19 @@ OMP45Stream::~OMP45Stream() } template -void OMP45Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void OMP45Stream::init_arrays(T initA, T initB, T initC) { + unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size]) - {} + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + for (int i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } } template diff --git a/OMP45Stream.h b/OMP45Stream.h index bd812a1e..d2a5aafe 100644 --- a/OMP45Stream.h +++ b/OMP45Stream.h @@ -37,7 +37,7 @@ class OMP45Stream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 33687a10..d872987a 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -21,12 +21,6 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = new T[ARRAY_SIZE]; d_b = new T[ARRAY_SIZE]; d_c = new T[ARRAY_SIZE]; - forall(index_set, [=] RAJA_DEVICE (int index) - { - d_a[index] = 0.0; - d_b[index] = 0.0; - d_c[index] = 0.0; - }); #else cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); @@ -50,12 +44,17 @@ RAJAStream::~RAJAStream() } template -void RAJAStream::write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) +void RAJAStream::init_arrays(T initA, T initB, T initC) { - std::copy(a.begin(), a.end(), d_a); - std::copy(b.begin(), b.end(), d_b); - std::copy(c.begin(), c.end(), d_c); + T* a = d_a; + T* b = d_b; + T* c = d_c; + forall(index_set, [=] RAJA_DEVICE (int index) + { + a[index] = initA; + b[index] = initB; + c[index] = initC; + }); } template diff --git a/RAJAStream.hpp b/RAJAStream.hpp index 454e20e7..8ffa5bec 100644 --- a/RAJAStream.hpp +++ b/RAJAStream.hpp @@ -50,8 +50,7 @@ class RAJAStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays( std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 12e96b48..919a657e 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -20,6 +20,7 @@ program * p; /* Forward declaration of SYCL kernels */ namespace kernels { + class init; class copy; class mul; class add; @@ -46,6 +47,7 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) /* Pre-build the kernels */ p = new program(queue->get_context()); + p->build_from_kernel_name(); p->build_from_kernel_name(); p->build_from_kernel_name(); p->build_from_kernel_name(); @@ -142,17 +144,23 @@ void 
SYCLStream::triad() } template -void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void SYCLStream::init_arrays(T initA, T initB, T initC) { - auto _a = d_a->template get_access(); - auto _b = d_b->template get_access(); - auto _c = d_c->template get_access(); - for (int i = 0; i < array_size; i++) + queue->submit([&](handler &cgh) { - _a[i] = a[i]; - _b[i] = b[i]; - _c[i] = c[i]; - } + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + { + auto id = item.get(); + ka[id[0]] = initA; + kb[id[0]] = initB; + kc[id[0]] = initC; + }); + }); + queue->wait(); } template diff --git a/SYCLStream.h b/SYCLStream.h index 8bc515db..4bd21d80 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -38,7 +38,7 @@ class SYCLStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/Stream.h b/Stream.h index 631e3058..44b4d8b2 100644 --- a/Stream.h +++ b/Stream.h @@ -31,7 +31,7 @@ class Stream virtual void triad() = 0; // Copy memory between host and device - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; + virtual void init_arrays(T initA, T initB, T initC) = 0; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; }; diff --git a/main.cpp b/main.cpp index 6a15aa76..cb6241f9 100644 --- a/main.cpp +++ b/main.cpp @@ -84,9 +84,9 @@ void run() std::cout << "Precision: double" << std::endl; // Create host vectors - std::vector a(ARRAY_SIZE, startA); - std::vector b(ARRAY_SIZE, startB); - std::vector c(ARRAY_SIZE, startC); + std::vector a(ARRAY_SIZE); + std::vector b(ARRAY_SIZE); + std::vector c(ARRAY_SIZE); std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" @@ -135,7 +135,7 @@ void run() #endif - stream->write_arrays(a, b, c); + stream->init_arrays(startA, startB, startC); // List of times std::vector> timings(4); From cb2221a64a16a76922c315781091993f61169e07 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Nov 2016 08:29:54 -0700 Subject: [PATCH 28/39] Add a common.h file --- .gitignore | 2 -- common.h | 9 +++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 common.h diff --git a/.gitignore b/.gitignore index 4d2865ce..6ef20a53 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ -common.h - gpu-stream-cuda gpu-stream-ocl gpu-stream-acc diff --git a/common.h b/common.h new file mode 100644 index 00000000..a66aa1a7 --- /dev/null +++ b/common.h @@ -0,0 +1,9 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#define VERSION_STRING "devel" + From ffac9fc352656306570e67128487488dbd9d2b9e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Nov 2016 12:50:20 -0600 Subject: [PATCH 29/39] [OMP45] Use alloc instead to to allocate device memory This fixes #11 --- OMP45Stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index 
From 02bff60870fb41c732f5bff1d01f26257970afb1 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 17 Nov 2016 21:01:30 +0000 Subject: [PATCH 30/39] [SYCL] Fix start index in reduction loop --- SYCLStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 6d2cc3f0..94420fc4 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -180,7 +180,7 @@ T SYCLStream<T>::dot() for (; i < N; i += item.get_global_range()[0]) wg_sum[li] += ka[i] * kb[i]; - for (int offset = item.get_local_range()[0]; offset > 0; offset /= 2) + for (int offset = item.get_local_range()[0] / 2; offset > 0; offset /= 2) { item.barrier(cl::sycl::access::fence_space::local_space); if (li < offset)

From 66776d5839a2260964e7b4a420a3470b9f400c37 Mon Sep 17 00:00:00 2001 From: James Price Date: Thu, 17 Nov 2016 23:52:13 +0000 Subject: [PATCH 31/39] [SYCL] Use consistent syntax for indexing --- SYCLStream.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 94420fc4..c8a908c9 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -98,7 +98,7 @@ void SYCLStream<T>::copy() range<1>{array_size}, [=](item<1> item) { auto id = item.get(); - kc[id[0]] = ka[id[0]]; + kc[id] = ka[id]; }); }); queue->wait(); @@ -116,7 +116,7 @@ void SYCLStream<T>::mul() range<1>{array_size}, [=](item<1> item) { auto id = item.get(); - kb[id[0]] = scalar * kc[id[0]]; + kb[id] = scalar * kc[id]; }); }); queue->wait(); @@ -134,7 +134,7 @@ void SYCLStream<T>::add() range<1>{array_size}, [=](item<1> item) { auto id = item.get(); - kc[id[0]] = ka[id[0]] + kb[id[0]]; + kc[id] = ka[id] + kb[id]; }); }); queue->wait(); @@ -153,7 +153,7 @@ void SYCLStream<T>::triad() range<1>{array_size}, [=](item<1> item) { auto id = item.get(); - ka[id] = kb[id[0]] + scalar * kc[id[0]]; + ka[id] = kb[id] + scalar * kc[id]; }); }); queue->wait();
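The bug patch 30 removes deserves a sentence: a work-group tree reduction must start its stride at half the group size. Starting at the full group size makes the first pass read wg_sum[li + offset] one whole group beyond the scratch array, and the partial sums never combine correctly. A hedged, host-side sketch of the corrected pattern (plain C++ rather than the SYCL kernel, and assuming a power-of-two group size, which the kernel also requires):

// Tree reduction over local_size partial sums; lane 0 ends with the total.
#include <vector>
#include <cstddef>

double tree_reduce(std::vector<double> &wg_sum)
{
  const size_t local_size = wg_sum.size();  // must be a power of two
  for (size_t offset = local_size / 2; offset > 0; offset /= 2)
  {
    // (in the SYCL kernel a work-group barrier sits here)
    for (size_t li = 0; li < offset; li++)
      wg_sum[li] += wg_sum[li + offset];  // in bounds: li + offset < local_size
  }
  return wg_sum[0];
}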
From 1e976ff1502fb86ab618920c666d2afb7f170034 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 18 Nov 2016 00:14:46 +0000 Subject: [PATCH 32/39] [SYCL] Fix multiple template specializations --- SYCLStream.cpp | 39 +++++++++++++++------------------------ SYCLStream.h | 18 ++++++++++++++++++ main.cpp | 4 +--- 3 files changed, 34 insertions(+), 27 deletions(-)
diff --git a/SYCLStream.cpp b/SYCLStream.cpp index c8a908c9..6160fc11 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -18,16 +18,6 @@ std::vector<device> devices; void getDeviceList(void); program * p; -/* Forward declaration of SYCL kernels */ -namespace kernels { - class init; - class copy; - class mul; - class add; - class triad; - class dot; -} - template <class T> SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { @@ -61,12 +51,12 @@ SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) /* Pre-build the kernels */ p = new program(queue->get_context()); - p->build_from_kernel_name<kernels::init>(); - p->build_from_kernel_name<kernels::copy>(); - p->build_from_kernel_name<kernels::mul>(); - p->build_from_kernel_name<kernels::add>(); - p->build_from_kernel_name<kernels::triad>(); - p->build_from_kernel_name<kernels::dot>(); + p->build_from_kernel_name<init_kernel>(); + p->build_from_kernel_name<copy_kernel>(); + p->build_from_kernel_name<mul_kernel>(); + p->build_from_kernel_name<add_kernel>(); + p->build_from_kernel_name<triad_kernel>(); + p->build_from_kernel_name<dot_kernel>(); // Create buffers d_a = new buffer<T>(array_size); @@ -94,7 +84,7 @@ void SYCLStream<T>::copy() { auto ka = d_a->template get_access<access::mode::read>(cgh); auto kc = d_c->template get_access<access::mode::write>(cgh); - cgh.parallel_for<kernels::copy>(p->get_kernel<kernels::copy>(), + cgh.parallel_for<copy_kernel>(p->get_kernel<copy_kernel>(), range<1>{array_size}, [=](item<1> item) { auto id = item.get(); @@ -112,7 +102,7 @@ void SYCLStream<T>::mul() { auto kb = d_b->template get_access<access::mode::write>(cgh); auto kc = d_c->template get_access<access::mode::read>(cgh); - cgh.parallel_for<kernels::mul>(p->get_kernel<kernels::mul>(), + cgh.parallel_for<mul_kernel>(p->get_kernel<mul_kernel>(), range<1>{array_size}, [=](item<1> item) { auto id = item.get(); @@ -130,7 +120,7 @@ void SYCLStream<T>::add() auto ka = d_a->template get_access<access::mode::read>(cgh); auto kb = d_b->template get_access<access::mode::read>(cgh); auto kc = d_c->template get_access<access::mode::write>(cgh); - cgh.parallel_for<kernels::add>(p->get_kernel<kernels::add>(), + cgh.parallel_for<add_kernel>(p->get_kernel<add_kernel>(), range<1>{array_size}, [=](item<1> item) { auto id = item.get(); @@ -149,7 +139,7 @@ void SYCLStream<T>::triad() auto ka = d_a->template get_access<access::mode::write>(cgh); auto kb = d_b->template get_access<access::mode::read>(cgh); auto kc = d_c->template get_access<access::mode::read>(cgh); - cgh.parallel_for<kernels::triad>(p->get_kernel<kernels::triad>(), + cgh.parallel_for<triad_kernel>(p->get_kernel<triad_kernel>(), range<1>{array_size}, [=](item<1> item) { auto id = item.get(); @@ -172,7 +162,8 @@ T SYCLStream<T>::dot() size_t N = array_size; - cgh.parallel_for<kernels::dot>(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + cgh.parallel_for<dot_kernel>(p->get_kernel<dot_kernel>(), + nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) { size_t i = item.get_global(0); size_t li = item.get_local(0); @@ -210,8 +201,8 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC) auto ka = d_a->template get_access<access::mode::write>(cgh); auto kb = d_b->template get_access<access::mode::write>(cgh); auto kc = d_c->template get_access<access::mode::write>(cgh); - cgh.parallel_for<kernels::init>(p->get_kernel<kernels::init>(), - range<1>{array_size}, [=](item<1> item) + cgh.parallel_for<init_kernel>(p->get_kernel<init_kernel>(), + range<1>{array_size}, [=](item<1> item) { auto id = item.get(); ka[id[0]] = initA; kb[id[0]] = initB; kc[id[0]] = initC; }); }); queue->wait(); // TODO: Fix kernel names to allow multiple template specializations -//template class SYCLStream<float>; +template class SYCLStream<float>; template class SYCLStream<double>;
diff --git a/SYCLStream.h b/SYCLStream.h index f3c8d252..ab62ecde 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -15,6 +15,16 @@ #define IMPLEMENTATION_STRING "SYCL" +namespace sycl_kernels +{ + template <class T> class init; + template <class T> class copy; + template <class T> class mul; + template <class T> class add; + template <class T> class triad; + template <class T> class dot; +} + template <class T> class SYCLStream : public Stream<T> { @@ -29,6 +39,14 @@ class SYCLStream : public Stream<T> cl::sycl::buffer<T> *d_c; cl::sycl::buffer<T> *d_sum; + // SYCL kernel names + typedef sycl_kernels::init<T> init_kernel; + typedef sycl_kernels::copy<T> copy_kernel; + typedef sycl_kernels::mul<T> mul_kernel; + typedef sycl_kernels::add<T> add_kernel; + typedef sycl_kernels::triad<T> triad_kernel; + typedef sycl_kernels::dot<T> dot_kernel; + // NDRange configuration for the dot kernel size_t dot_num_groups; size_t dot_wgsize;
diff --git a/main.cpp b/main.cpp index 16e32415..2d80814d 100644 --- a/main.cpp +++ b/main.cpp @@ -61,13 +61,11 @@ int main(int argc, char *argv[]) parseArguments(argc, argv); - // TODO: Fix SYCL to allow multiple template specializations -#ifndef SYCL + // TODO: Fix Kokkos to allow multiple template specializations #ifndef KOKKOS if (use_float) run<float>(); else -#endif #endif run<double>();
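The idea behind patch 32 is that SYCL kernel names are C++ types, so a fixed name such as kernels::copy can only name one kernel, and instantiating the stream for both float and double collides. Making the name itself a class template keyed on T gives each precision its own kernel type. A self-contained sketch of the pattern; copy_name and launch_copy are illustrative, not identifiers from the patch:

#include <CL/sycl.hpp>

// Never defined: used purely as a kernel name. copy_name<float> and
// copy_name<double> are distinct types, so both precisions can coexist.
template <class T> class copy_name;

template <class T>
void launch_copy(cl::sycl::queue &q,
                 cl::sycl::buffer<T, 1> &a, cl::sycl::buffer<T, 1> &c,
                 size_t n)
{
  q.submit([&](cl::sycl::handler &cgh)
  {
    auto ka = a.template get_access<cl::sycl::access::mode::read>(cgh);
    auto kc = c.template get_access<cl::sycl::access::mode::write>(cgh);
    cgh.parallel_for<copy_name<T>>(cl::sycl::range<1>{n},
        [=](cl::sycl::id<1> i) { kc[i] = ka[i]; });
  });
}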
From db01715806a2a1772b525f8dc2cad3deb21eccd4 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 18 Nov 2016 00:35:36 +0000 Subject: [PATCH 33/39] [SYCL] Explicitly use first dimension of ranges --- SYCLStream.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 6160fc11..abe048cc 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -87,7 +87,7 @@ void SYCLStream<T>::copy() cgh.parallel_for<copy_kernel>(p->get_kernel<copy_kernel>(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); + auto id = item.get()[0]; kc[id] = ka[id]; }); }); queue->wait(); @@ -105,7 +105,7 @@ void SYCLStream<T>::mul() cgh.parallel_for<mul_kernel>(p->get_kernel<mul_kernel>(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); + auto id = item.get()[0]; kb[id] = scalar * kc[id]; }); }); queue->wait(); @@ -123,7 +123,7 @@ void SYCLStream<T>::add() cgh.parallel_for<add_kernel>(p->get_kernel<add_kernel>(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); + auto id = item.get()[0]; kc[id] = ka[id] + kb[id]; }); }); queue->wait(); @@ -142,7 +142,7 @@ void SYCLStream<T>::triad() cgh.parallel_for<triad_kernel>(p->get_kernel<triad_kernel>(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); + auto id = item.get()[0]; ka[id] = kb[id] + scalar * kc[id]; }); }); queue->wait(); @@ -167,11 +167,14 @@ T SYCLStream<T>::dot() { size_t i = item.get_global(0); size_t li = item.get_local(0); + size_t global_size = item.get_global_range()[0]; + wg_sum[li] = 0.0; - for (; i < N; i += item.get_global_range()[0]) + for (; i < N; i += global_size) wg_sum[li] += ka[i] * kb[i]; - for (int offset = item.get_local_range()[0] / 2; offset > 0; offset /= 2) + size_t local_size = item.get_local_range()[0]; + for (int offset = local_size / 2; offset > 0; offset /= 2) { item.barrier(cl::sycl::access::fence_space::local_space); if (li < offset) @@ -204,10 +207,10 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC) cgh.parallel_for<init_kernel>(p->get_kernel<init_kernel>(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); - ka[id[0]] = initA; - kb[id[0]] = initB; - kc[id[0]] = initC; + auto id = item.get()[0]; + ka[id] = initA; + kb[id] = initB; + kc[id] = initC; }); }); queue->wait();
From e6615944f4e18c1e511653190720f5c10c951045 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 9 Dec 2016 11:49:58 +0000 Subject: [PATCH 34/39] Use a compiler switch to select OpenMP directives (target or parallel for) --- OMP45Stream.cpp => OMPStream.cpp | 72 +++++++++++++++++++++++++------- OMP45Stream.h => OMPStream.h | 6 +-- main.cpp | 16 +++---- 3 files changed, 66 insertions(+), 28 deletions(-) rename OMP45Stream.cpp => OMPStream.cpp (71%) rename OMP45Stream.h => OMPStream.h (87%)
diff --git a/OMP45Stream.cpp b/OMPStream.cpp similarity index 71% rename from OMP45Stream.cpp rename to OMPStream.cpp index 8ba04341..da51937e 100644 --- a/OMP45Stream.cpp +++ b/OMPStream.cpp @@ -5,25 +5,31 @@ // For full license terms please see the LICENSE file distributed with this // source code -#include "OMP45Stream.h" +#include "OMPStream.h" template <class T> -OMP45Stream<T>::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) +OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) { - omp_set_default_device(device); - array_size = ARRAY_SIZE; +#ifdef OMP_TARGET_GPU + omp_set_default_device(device); // Set up data region on device this->a = a; this->b = b; this->c = c; #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) {} +#else + // Allocate on the host + this->a = (T*)malloc(sizeof(T)*array_size); + this->b = (T*)malloc(sizeof(T)*array_size); + this->c = (T*)malloc(sizeof(T)*array_size); +#endif } template <class T> -OMP45Stream<T>::~OMP45Stream() +OMPStream<T>::~OMPStream() { // End data region on device unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; T *c = this->c; @@ -35,13 +41,17 @@ OMP45Stream<T>::~OMP45Stream() } template <class T> -void OMP45Stream<T>::init_arrays(T initA, T initB, T initC) +void OMPStream<T>::init_arrays(T initA, T initB, T initC) { unsigned int array_size = this->array_size; +#ifdef OMP_TARGET_GPU T *a = this->a; T *b = this->b; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { a[i] = initA; b[i] = initB; c[i] = initC; } } template <class T> -void OMP45Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) +void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) { +#ifdef OMP_TARGET_GPU T *a = this->a; T *b = this->b; T *c = this->c; #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) {} +#else + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +#endif } template <class T> -void OMP45Stream<T>::copy() +void OMPStream<T>::copy() { +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *a = this->a; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { c[i] = a[i]; } } template <class T> -void OMP45Stream<T>::mul() +void OMPStream<T>::mul() { const T scalar = startScalar; +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *b = this->b; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { b[i] = scalar * c[i]; } } template <class T> -void OMP45Stream<T>::add() +void OMPStream<T>::add() { +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { c[i] = a[i] + b[i]; } } template <class T> -void OMP45Stream<T>::triad() +void OMPStream<T>::triad() { const T scalar = startScalar; +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { a[i] = b[i] + scalar * c[i]; } } template <class T> -T OMP45Stream<T>::dot() +T OMPStream<T>::dot() { T sum = 0.0; +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; #pragma omp target teams distribute parallel for simd reduction(+:sum) map(tofrom: sum) +#else + #pragma omp parallel for reduction(+:sum) +#endif for (int i = 0; i < array_size; i++) { sum += a[i] * b[i]; } return sum; } void listDevices(void) { +#ifdef OMP_TARGET_GPU // Get number of devices int count = omp_get_num_devices(); // Print device list if (count == 0) { std::cerr << "No devices found." << std::endl; } else { std::cout << "There are " << count << " devices."
<< std::endl; } +#else + std::cout << "0: CPU" << std::endl; +#endif } std::string getDeviceName(const int) { return std::string("Device name unavailable"); } std::string getDeviceDriver(const int) { return std::string("Device driver unavailable"); } -template class OMP45Stream<float>; -template class OMP45Stream<double>; +template class OMPStream<float>; +template class OMPStream<double>;
diff --git a/OMP45Stream.h b/OMPStream.h similarity index 87% rename from OMP45Stream.h rename to OMPStream.h index e99fdeb6..08af1942 100644 --- a/OMP45Stream.h +++ b/OMPStream.h @@ -17,7 +17,7 @@ #define IMPLEMENTATION_STRING "OpenMP 4.5" template <class T> -class OMP45Stream : public Stream<T> +class OMPStream : public Stream<T> { protected: // Size of arrays unsigned int array_size; // Device side pointers T *a; T *b; T *c; public: - OMP45Stream(const unsigned int, T*, T*, T*, int); - ~OMP45Stream(); + OMPStream(const unsigned int, T*, T*, T*, int); + ~OMPStream(); virtual void copy() override; virtual void add() override;
diff --git a/main.cpp b/main.cpp index 2d80814d..c73322fa 100644 --- a/main.cpp +++ b/main.cpp @@ -32,10 +32,8 @@ #include "ACCStream.h" #elif defined(SYCL) #include "SYCLStream.h" -#elif defined(OMP3) -#include "OMP3Stream.h" -#elif defined(OMP45) -#include "OMP45Stream.h" +#elif defined(OMP) +#include "OMPStream.h" #endif // Default size of 2^25 @@ -126,13 +124,9 @@ void run() // Use the SYCL implementation stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex); -#elif defined(OMP3) - // Use the "reference" OpenMP 3 implementation - stream = new OMP3Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data()); -#elif defined(OMP45) - // Use the "reference" OpenMP 3 implementation - stream = new OMP45Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); +#elif defined(OMP) + // Use the OpenMP implementation + stream = new OMPStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); #endif
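The shape of the single-source pattern that patch 34 adopts is compact enough to show on its own: one loop body per routine, with the directive chosen when the translation unit is compiled. This is a sketch, not code from the patch; OMP_TARGET_GPU is the same switch used in OMPStream.cpp, stream_copy is a hypothetical name, and on the GPU path a and c are assumed to be device-resident already (mapped earlier by omp target enter data):

// One kernel body, two directive choices, selected at compile time.
template <class T>
void stream_copy(const T *a, T *c, int n)
{
#ifdef OMP_TARGET_GPU
  #pragma omp target teams distribute parallel for simd
#else
  #pragma omp parallel for
#endif
  for (int i = 0; i < n; i++)
    c[i] = a[i];
}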
From 469d8d563472560ba490170eb871103d16a88525 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 9 Dec 2016 12:19:07 +0000 Subject: [PATCH 35/39] Remove old OpenMP 3 code --- OMP3Stream.cpp | 126 ------------------------------------------------- OMP3Stream.h | 41 ---------------- 2 files changed, 167 deletions(-) delete mode 100644 OMP3Stream.cpp delete mode 100644 OMP3Stream.h
diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp deleted file mode 100644 index 6334b653..00000000 --- a/OMP3Stream.cpp +++ /dev/null @@ -1,126 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code -#include "OMP3Stream.h" - -template <class T> -OMP3Stream<T>::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) -{ - array_size = ARRAY_SIZE; - this->a = (T*)malloc(sizeof(T)*array_size); - this->b = (T*)malloc(sizeof(T)*array_size); - this->c = (T*)malloc(sizeof(T)*array_size); -} - -template <class T> -OMP3Stream<T>::~OMP3Stream() -{ - free(a); - free(b); - free(c); -} - -template <class T> -void OMP3Stream<T>::init_arrays(T initA, T initB, T initC) -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - a[i] = initA; - b[i] = initB; - c[i] = initC; - } -} - -template <class T> -void OMP3Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c) -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } -} - -template <class T> -void OMP3Stream<T>::copy() -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - c[i] = a[i]; - } -} - -template <class T> -void OMP3Stream<T>::mul() -{ - const T scalar = startScalar; - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - b[i] = scalar * c[i]; - } -} - -template <class T> -void OMP3Stream<T>::add() -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - c[i] = a[i] + b[i]; - } -} - -template <class T> -void OMP3Stream<T>::triad() -{ - const T scalar = startScalar; - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - a[i] = b[i] + scalar * c[i]; - } -} - -template <class T> -T OMP3Stream<T>::dot() -{ - T sum = 0.0; - - #pragma omp parallel for reduction(+:sum) - for (int i = 0; i < array_size; i++) - { - sum += a[i] * b[i]; - } - - return sum; -} - -void listDevices(void) -{ - std::cout << "0: CPU" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} - -template class OMP3Stream<float>; -template class OMP3Stream<double>;
diff --git a/OMP3Stream.h b/OMP3Stream.h deleted file mode 100644 index b6ae1c91..00000000 --- a/OMP3Stream.h +++ /dev/null @@ -1,41 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once - -#include <iostream> -#include <stdlib.h> - -#include "Stream.h" - -#define IMPLEMENTATION_STRING "Reference OpenMP" - -template <class T> -class OMP3Stream : public Stream<T> -{ - protected: - // Size of arrays - unsigned int array_size; - // Device side pointers - T *a; - T *b; - T *c; - - public: - OMP3Stream(const unsigned int, T*, T*, T*); - ~OMP3Stream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; -};

From 1d4a5dc3466fa6be52e6f3edb56ed8acb7aee699 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 9 Dec 2016 12:22:30 +0000 Subject: [PATCH 36/39] Remove version number from the OpenMP implementation string --- OMPStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/OMPStream.h b/OMPStream.h index 08af1942..c475274f 100644 --- a/OMPStream.h +++ b/OMPStream.h @@ -14,7 +14,7 @@ #include <omp.h> -#define IMPLEMENTATION_STRING "OpenMP 4.5" +#define IMPLEMENTATION_STRING "OpenMP" template <class T> class OMPStream : public Stream<T>
"@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" - diff --git a/main.cpp b/main.cpp index c73322fa..c1ca69f7 100644 --- a/main.cpp +++ b/main.cpp @@ -15,7 +15,8 @@ #include #include -#include "common.h" +#define VERSION_STRING "devel" + #include "Stream.h" #if defined(CUDA) From d75b0004de9a1fc071af2eea37b37772e2d3cce8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 13 Dec 2016 11:45:30 +0000 Subject: [PATCH 38/39] [OMP] Update deconstructor to only call target region if building for GPU --- OMPStream.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/OMPStream.cpp b/OMPStream.cpp index da51937e..189cacb6 100644 --- a/OMPStream.cpp +++ b/OMPStream.cpp @@ -31,6 +31,7 @@ OMPStream::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev template OMPStream::~OMPStream() { +#ifdef OMP_TARGET_GPU // End data region on device unsigned int array_size = this->array_size; T *a = this->a; @@ -38,6 +39,11 @@ OMPStream::~OMPStream() T *c = this->c; #pragma omp target exit data map(release: a[0:array_size], b[0:array_size], c[0:array_size]) {} +#else + free(a); + free(b); + free(c); +#endif } template From b9c514fd9bf63b57aa0730ac39bacff6f82576c5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 19 Dec 2016 11:42:45 +0000 Subject: [PATCH 39/39] [CUDA] Free the sum device buffer --- CUDAStream.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CUDAStream.cu b/CUDAStream.cu index 043c8c71..7b1e0df8 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -77,6 +77,8 @@ CUDAStream::~CUDAStream() check_error(); cudaFree(d_c); check_error(); + cudaFree(d_sum); + check_error(); }