Align BabelStream with alpaka version
bernhardmgruber committed Jan 10, 2024
1 parent a5b3024 commit e51a145
Showing 5 changed files with 48 additions and 66 deletions.
3 changes: 3 additions & 0 deletions examples/alpaka/babelstream/.clang-format
@@ -0,0 +1,3 @@
BasedOnStyle: InheritParentConfig
QualifierAlignment: Custom
QualifierOrder: ['friend', 'static', 'inline', 'constexpr', 'type', 'const', 'volatile']
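
The three lines above pin qualifier placement to east-const on top of the inherited configuration. A rough sketch of what that means for ordinary declarations (hypothetical code, not taken from the repository):

    // before formatting (west const)
    void scale(const float& factor, const float* in, float* out);
    static constexpr const int defaultBlockSize = 1024;

    // after formatting with QualifierAlignment: Custom and the order above
    void scale(float const& factor, float const* in, float* out);
    static constexpr int const defaultBlockSize = 1024;

This is why almost every touched line in AlpakaStream.cpp below swaps `const T&` for `T const&`.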
91 changes: 37 additions & 54 deletions examples/alpaka/babelstream/AlpakaStream.cpp
@@ -10,9 +10,6 @@
#include "AlpakaStream.h"

#include <numeric>
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
# include <cuda_runtime.h>
#endif

namespace
{
@@ -28,7 +25,7 @@ namespace
T& ref;

// NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp)
LLAMA_ACC LLAMA_FORCE_INLINE auto operator=(const Reference& r) -> Reference&
LLAMA_ACC LLAMA_FORCE_INLINE auto operator=(Reference const& r) -> Reference&
{
*this = static_cast<T>(r);
return *this;
@@ -78,9 +75,9 @@ AlpakaStream<T>::AlpakaStream(Idx arraySize, Idx deviceIndex)
struct InitKernel
{
template<typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(const TAcc& acc, T* a, T* b, T* c, T initA, T initB, T initC) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA, T initB, T initC) const
{
const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
a[i] = initA;
b[i] = initB;
c[i] = initC;
@@ -90,7 +87,7 @@ struct InitKernel
template<typename T>
void AlpakaStream<T>::init_arrays(T initA, T initB, T initC)
{
const auto workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
@@ -108,176 +105,163 @@ void AlpakaStream<T>::init_arrays(T initA, T initB, T initC)
template<typename T>
void AlpakaStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
// TODO(bgruber): avoid temporary alpaka views when we upgrade to alpaka 1.0.0
auto va = alpaka::createView(devHost, a, arraySize);
alpaka::memcpy(queue, va, d_a);
auto vb = alpaka::createView(devHost, b, arraySize);
alpaka::memcpy(queue, vb, d_b);
auto vc = alpaka::createView(devHost, c, arraySize);
alpaka::memcpy(queue, vc, d_c);
alpaka::memcpy(queue, alpaka::createView(devHost, a), d_a);
alpaka::memcpy(queue, alpaka::createView(devHost, b), d_b);
alpaka::memcpy(queue, alpaka::createView(devHost, c), d_c);
}
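
The rewritten read_arrays drops the named temporary views together with the TODO that asked for their removal; with alpaka 1.0, createView can take the extent from the std::vector itself. A minimal sketch of the copy-back pattern, assuming devHost, queue, d_a, and arraySize exist as members of this class and using float as a stand-in for the element type T:

    std::vector<float> a(arraySize);
    // wrap the host vector without an explicit extent and copy the device
    // buffer into it through the queue
    alpaka::memcpy(queue, alpaka::createView(devHost, a), d_a);
    alpaka::wait(queue); // ensure the copy finished before reading a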

struct CopyKernel
{
template<typename TAcc, typename ViewA, typename ViewB>
ALPAKA_FN_ACC void operator()(const TAcc& acc, ViewA a, ViewB c) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, ViewA a, ViewB c) const
{
const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
c[i] = a[i];
}
};

template<typename T>
void AlpakaStream<T>::copy()
{
const auto workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

auto viewA
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_a))}, Accessor{}};
auto viewC
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_c))}, Accessor{}};

alpaka::exec<Acc>(queue, workdiv, CopyKernel{}, viewA, viewC);
alpaka::wait(queue);
}

struct MulKernel
{
template<typename TAcc, typename ViewB, typename ViewC>
ALPAKA_FN_ACC void operator()(const TAcc& acc, ViewB b, ViewC c) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, ViewB b, ViewC c) const
{
const typename ViewB::RecordDim scalar = startScalar;
const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
b[i] = scalar * c[i];
}
};

template<typename T>
void AlpakaStream<T>::mul()
{
const auto workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

auto viewB
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_b))}, Accessor{}};
auto viewC
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_c))}, Accessor{}};

alpaka::exec<Acc>(queue, workdiv, MulKernel{}, viewB, viewC);
alpaka::wait(queue);
}

struct AddKernel
{
template<typename TAcc, typename ViewA, typename ViewB, typename ViewC>
ALPAKA_FN_ACC void operator()(const TAcc& acc, ViewA a, ViewB b, ViewC c) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, ViewA a, ViewB b, ViewC c) const
{
const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
c[i] = a[i] + b[i];
}
};

template<typename T>
void AlpakaStream<T>::add()
{
const auto workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

auto viewA
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_a))}, Accessor{}};
auto viewB
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_b))}, Accessor{}};
auto viewC
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_c))}, Accessor{}};

alpaka::exec<Acc>(queue, workdiv, AddKernel{}, viewA, viewB, viewC);
alpaka::wait(queue);
}

struct TriadKernel
{
template<typename TAcc, typename ViewA, typename ViewB, typename ViewC>
ALPAKA_FN_ACC void operator()(const TAcc& acc, ViewA a, ViewB b, ViewC c) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, ViewA a, ViewB b, ViewC c) const
{
const typename ViewB::RecordDim scalar = startScalar;
const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
a[i] = b[i] + scalar * c[i];
}
};

template<typename T>
void AlpakaStream<T>::triad()
{
const auto workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

auto viewA
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_a))}, Accessor{}};
auto viewB
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_b))}, Accessor{}};
auto viewC
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_c))}, Accessor{}};

alpaka::exec<Acc>(queue, workdiv, TriadKernel{}, viewA, viewB, viewC);
alpaka::wait(queue);
}

struct NstreamKernel
{
template<typename TAcc, typename ViewA, typename ViewB, typename ViewC>
ALPAKA_FN_ACC void operator()(const TAcc& acc, ViewA a, ViewB b, ViewC c) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, ViewA a, ViewB b, ViewC c) const
{
const typename ViewB::RecordDim scalar = startScalar;
const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
a[i] += b[i] + scalar * c[i];
}
};

template<typename T>
void AlpakaStream<T>::nstream()
{
const auto workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);

auto viewA
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_a))}, Accessor{}};
auto viewB
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_b))}, Accessor{}};
auto viewC
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_c))}, Accessor{}};

alpaka::exec<Acc>(queue, workdiv, NstreamKernel{}, viewA, viewB, viewC);
alpaka::wait(queue);
}

struct DotKernel
{
template<typename TAcc, typename ViewA, typename ViewB, typename ViewSum>
ALPAKA_FN_ACC void operator()(const TAcc& acc, ViewA a, ViewB b, ViewSum sum, int arraySize) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, ViewA a, ViewB b, ViewSum sum, int arraySize) const
{
using T = typename ViewA::RecordDim;

// TODO(Jeff Young) - test if sharedMem bug is affecting performance here
auto& tbSum = alpaka::declareSharedVar<T[blockSize], __COUNTER__>(acc);

auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
const auto [local_i] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
const auto [totalThreads] = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
auto const [local_i] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
auto const [totalThreads] = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

tbSum[local_i] = 0.0;
T threadSum = 0;
for(; i < arraySize; i += totalThreads) // NOLINT(bugprone-infinite-loop)
tbSum[local_i] += a[i] * b[i];
threadSum += a[i] * b[i];
tbSum[local_i] = threadSum;

const auto [blockDim] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
auto const [blockDim] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
for(int offset = blockDim / 2; offset > 0; offset /= 2)
{
alpaka::syncBlockThreads(acc);
if(local_i < offset)
tbSum[local_i] += tbSum[local_i + offset];
}

const auto [blockIdx] = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
auto const [blockIdx] = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
if(local_i == 0)
sum[blockIdx] = tbSum[local_i];
}
@@ -286,35 +270,34 @@ struct DotKernel
template<typename T>
auto AlpakaStream<T>::dot() -> T
{
const auto workdiv = WorkDiv{dotBlockSize, blockSize, 1};
auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, dotBlockSize * blockSize);

auto viewA
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_a))}, Accessor{}};
auto viewB
= llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_b))}, Accessor{}};
auto viewSum = llama::View{mapping, llama::Array{reinterpret_cast<std::byte*>(alpaka::getPtrNative(d_sum))}};

alpaka::exec<Acc>(queue, workdiv, DotKernel{}, viewA, viewB, viewSum, arraySize);
alpaka::wait(queue);

alpaka::memcpy(queue, sums, d_sum);
const T* sumPtr = alpaka::getPtrNative(sums);
return std::reduce(sumPtr, sumPtr + dotBlockSize);
T const* sumPtr = alpaka::getPtrNative(sums);
// TODO(bgruber): replace by std::reduce, when gcc 9.3 is the baseline
return std::accumulate(sumPtr, sumPtr + dotBlockSize, T{0});
}

void listDevices()
{
const auto platform = alpaka::Platform<Acc>{};
const auto count = alpaka::getDevCount(platform);
auto const platform = alpaka::Platform<Acc>{};
auto const count = alpaka::getDevCount(platform);
std::cout << "Devices:" << std::endl;
for(int i = 0; i < count; i++)
std::cout << i << ": " << getDeviceName(i) << std::endl;
}

auto getDeviceName(int deviceIndex) -> std::string
{
const auto platform = alpaka::Platform<Acc>{};
auto const platform = alpaka::Platform<Acc>{};
return alpaka::getName(alpaka::getDevByIdx(platform, deviceIndex));
}
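
Beyond qualifier placement, this file changes two things: DotKernel now accumulates each thread's products in a private threadSum and stores it into the shared tbSum slot once before the tree reduction, and the host-side reduction uses std::accumulate until gcc 9.3 becomes the baseline. A serial stand-in for that two-level dot product, with hypothetical sizes, only to illustrate the arithmetic:

    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main()
    {
        constexpr std::size_t blockSize = 4;  // hypothetical sizes, far smaller than the benchmark's
        constexpr std::size_t numBlocks = 2;
        constexpr std::size_t arraySize = 32;

        std::vector<double> const a(arraySize, 0.1);
        std::vector<double> const b(arraySize, 0.2);
        std::vector<double> blockSums(numBlocks, 0.0);

        std::size_t const totalThreads = blockSize * numBlocks;
        for(std::size_t block = 0; block < numBlocks; block++)
        {
            std::vector<double> tbSum(blockSize, 0.0);
            for(std::size_t local = 0; local < blockSize; local++)
            {
                double threadSum = 0; // per-thread accumulator, written to shared memory once
                for(std::size_t i = block * blockSize + local; i < arraySize; i += totalThreads)
                    threadSum += a[i] * b[i];
                tbSum[local] = threadSum;
            }
            // tree reduction over the block's shared-memory array
            for(std::size_t offset = blockSize / 2; offset > 0; offset /= 2)
                for(std::size_t local = 0; local < offset; local++)
                    tbSum[local] += tbSum[local + offset];
            blockSums[block] = tbSum[0]; // one partial sum per block, as in d_sum
        }

        // host-side reduction of the per-block sums (std::accumulate, as in dot())
        double const dot = std::accumulate(blockSums.begin(), blockSums.end(), 0.0);
        std::cout << "dot = " << dot << '\n'; // 32 * 0.1 * 0.2 = 0.64
        return 0;
    }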

2 changes: 1 addition & 1 deletion examples/alpaka/babelstream/CMakeLists.txt
@@ -1,7 +1,7 @@
# Copyright 2022 Bernhard Manfred Gruber
# SPDX-License-Identifier: CC0-1.0

cmake_minimum_required (VERSION 3.18.3)
cmake_minimum_required(VERSION 3.22)
project(llama-alpaka-babelstream CXX)

if (NOT TARGET llama::llama)
5 changes: 2 additions & 3 deletions examples/alpaka/babelstream/Stream.h
@@ -40,10 +40,9 @@ class Stream
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
};


// Implementation specific device functions
void listDevices(void);
std::string getDeviceName(const int);
std::string getDeviceDriver(const int);
std::string getDeviceName(int const);
std::string getDeviceDriver(int const);

// NOLINTEND
13 changes: 5 additions & 8 deletions examples/alpaka/babelstream/main.cpp
@@ -56,7 +56,7 @@
#endif

// Default size of 2^25
int ARRAY_SIZE = 33554432;
int ARRAY_SIZE = 33'554'432;
unsigned int num_times = 100;
unsigned int deviceIndex = 0;
bool use_float = false;
@@ -65,7 +65,7 @@ bool mibibytes = false;
std::string csv_separator = ",";

template<typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);
void check_solution(unsigned int const ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);

template<typename T>
void run();
@@ -103,7 +103,6 @@ int main(int argc, char* argv[])
run<double>();
}


// Run the 5 main kernels
template<typename T>
std::vector<std::vector<double>> run_all(Stream<T>* stream, T& sum)
@@ -196,7 +195,6 @@ std::vector<std::vector<double>> run_nstream(Stream<T>* stream)
return timings;
}


// Generic run routine
// Runs the kernel(s) and prints output.
template<typename T>
@@ -428,9 +426,8 @@ void run()
delete stream;
}


template<typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
void check_solution(unsigned int const ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
{
// Generate correct solution
T goldA = startA;
@@ -489,14 +486,14 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
<< std::setprecision(15) << "Sum was " << sum << " but should be " << goldSum << std::endl;
}

int parseUInt(const char* str, unsigned int* output)
int parseUInt(char const* str, unsigned int* output)
{
char* next;
*output = strtoul(str, &next, 10);
return !strlen(next);
}

int parseInt(const char* str, int* output)
int parseInt(char const* str, int* output)
{
char* next;
*output = strtol(str, &next, 10);
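
The default problem size in this file only gains C++14 digit separators; the value itself is unchanged. A standalone one-liner to convince yourself of that:

    static_assert(33'554'432 == 33554432 && 33'554'432 == 1 << 25, "default ARRAY_SIZE is still 2^25");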
