diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..9409415
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,20 @@
+# YAML 1.2
+# Metadata for citation of this software according to the CFF format (https://citation-file-format.github.io/)
+cff-version: 1.0.3
+message: If you use this software, please cite it using these metadata.
+title: 'Hashinator: A hybrid hashmap designed for heterogeneous computing.'
+doi: 10.5281/zenodo.11396297
+authors:
+- given-names: Kostis
+  family-names: Papadakis
+  affiliation: University of Helsinki
+- given-names: Markus
+  family-names: Battarbee
+  affiliation: University of Helsinki
+- given-names: René
+  family-names: Widera
+  affiliation:
+version: 1.0.1
+date-released: 2024-05-30
+repository-code: https://github.com/fmihpc/hashinator
+license: cc-by-4.0
diff --git a/README.md b/README.md
index fa69fab..60c3351 100644
--- a/README.md
+++ b/README.md
@@ -27,8 +27,11 @@ No installation required. Just include "hashinator.h". However, if you plan
 ## Run the tests
 ```
-meson wrap install gtest # skip if you have gtest installed system wide and pkg-config can find it.
-meson setup build --buildtype=release
+cd hashinator
+mkdir subprojects
+meson wrap install gtest
+meson setup build --buildtype=release
+meson compile -C build --jobs=8
 meson test -C build
 ```
@@ -150,6 +153,45 @@ int main()
    hmap.insert(src.data(),src.size());
 }
 ```
+### Hashinator and SplitVector can also be used with external allocators
+
+```c++
+//main.cu
+#include "splitvec.h"
+#include "hashinator.h"
+
+using namespace Hashinator;
+using vector = split::SplitVector<int, std::allocator<int>>;
+
+int main()
+{
+   vector* vec = new vector{1,2,3,4,5};
+   vec->reserve(128);
+   std::cout<<vec->size()<<std::endl;
+   delete vec;
+
+   Hashmap<uint32_t, uint32_t, std::allocator<hash_pair<uint32_t, uint32_t>>> hmap;
+
+   //Write
+   for (uint32_t i=0 ; i<64; ++i){
+      hmap[i]=rand()%10000;
+   }
+
+   //Read
+   for (const auto& i:hmap){
+      std::cout<<"["<<i.first<<","<<i.second<<"] ";
+   }
+   std::cout<<std::endl;
+}
+```
diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
 #ifndef HASHINATOR_CPU_ONLY_MODE
 template <typename T>
-using DefaultMetaAllocator = split::split_unified_allocator<T>;
+using DefaultAllocator = split::split_unified_allocator<T>;
 #define DefaultHasher \
    Hashers::Hasher
 #else
 template <typename T>
-using DefaultMetaAllocator = split::split_host_allocator<T>;
+using DefaultAllocator = split::split_host_allocator<T>;
 #define DefaultHasher void
 #endif
 
 using MapInfo = Hashinator::Info;
 
-template <typename KEY_TYPE, typename VAL_TYPE, KEY_TYPE EMPTYBUCKET = std::numeric_limits<KEY_TYPE>::max(),
-          KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci<KEY_TYPE>,
-          class DeviceHasher = DefaultHasher, class Meta_Allocator = DefaultMetaAllocator<MapInfo>>
+template <typename KEY_TYPE, typename VAL_TYPE, class Allocator = DefaultAllocator<hash_pair<KEY_TYPE, VAL_TYPE>>,
+          KEY_TYPE EMPTYBUCKET = std::numeric_limits<KEY_TYPE>::max(), KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1,
+          class HashFunction = HashFunctions::Fibonacci<KEY_TYPE>, class DeviceHasher = DefaultHasher>
 class Hashmap {
 private:
+
+   // How many sizeof(hash_pair) slots are needed to hold one MapInfo, so that
+   // the single element Allocator can also provide the metadata storage.
+   constexpr size_t get_number_of_Ts_for_Map_Info() const noexcept {
+      constexpr size_t size_of_T = sizeof(hash_pair<KEY_TYPE, VAL_TYPE>);
+      constexpr size_t size_of_info = sizeof(MapInfo);
+      if constexpr (size_of_T > size_of_info) {
+         return 1;
+      }
+      // Round up with integer arithmetic; std::ceil over an integer division
+      // would truncate before the ceiling is taken.
+      return (size_of_info + size_of_T - 1) / size_of_T;
+   }
+
    // CUDA device handle
    Hashmap* device_map;
-   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>* device_buckets;
+   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>* device_buckets;
    //~CUDA device handle
 
    // Host members
-   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>> buckets;
-   Meta_Allocator _metaAllocator; // Allocator used to allocate and deallocate memory for metadata
+   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator> buckets;
+   Allocator _allocator; // Allocator used to allocate and deallocate memory for elements and metadata
    MapInfo* _mapInfo;
    //~Host members
@@ -85,8 +95,7 @@ class Hashmap {
    void preallocate_device_handles() {
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMalloc((void**)&device_map, sizeof(Hashmap)));
-      device_buckets = reinterpret_cast<split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>*>(
-          reinterpret_cast<char*>(device_map) + 
offsetof(Hashmap, buckets));
+      device_buckets = &device_map->buckets;
 #endif
    }
@@ -106,39 +115,62 @@ class Hashmap {
    inline void set_status(status code) noexcept { _mapInfo->err = code; }
 
 public:
-   Hashmap() {
+   // By default we allocate enough space for 1<<5 elements.
+   Hashmap() : _allocator(Allocator{}) {
+      preallocate_device_handles();
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
+      *_mapInfo = MapInfo(5);
+      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), _allocator);
+#ifndef HASHINATOR_CPU_ONLY_MODE
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
+#endif
+   };
+
+   // buckets must be initialized from the ctor argument: _allocator is
+   // declared after buckets and is not yet constructed at that point.
+   Hashmap(const Allocator& allocator)
+       : buckets(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+             1 << 5, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), allocator)),
+         _allocator(allocator) {
       preallocate_device_handles();
-      _mapInfo = _metaAllocator.allocate(1);
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
       *_mapInfo = MapInfo(5);
-      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
-          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
 #endif
    };
 
-   Hashmap(int sizepower) {
+   Hashmap(int sizepower) : _allocator(Allocator{}) {
+      preallocate_device_handles();
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
+      *_mapInfo = MapInfo(sizepower);
+      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), _allocator);
+#ifndef HASHINATOR_CPU_ONLY_MODE
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
+#endif
+   };
+
+   Hashmap(int sizepower, const Allocator& allocator)
+       : buckets(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+             1 << sizepower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), allocator)),
+         _allocator(allocator) {
      preallocate_device_handles();
-      _mapInfo = _metaAllocator.allocate(1);
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
       *_mapInfo = MapInfo(sizepower);
-      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
-          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
 #endif
    };
 
-   Hashmap(const Hashmap<KEY_TYPE, VAL_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher, Meta_Allocator>& other) {
+   Hashmap(const Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>& other) {
+      _allocator = other._allocator;
       preallocate_device_handles();
-      _mapInfo = _metaAllocator.allocate(1);
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
       *_mapInfo = *(other._mapInfo);
       buckets = other.buckets;
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
 #endif
    };
-
-   Hashmap(Hashmap&& other) {
+
+   Hashmap(Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>&& other) {
+      _allocator = other._allocator;
       preallocate_device_handles();
       _mapInfo = other._mapInfo;
       other._mapInfo = nullptr;
@@ -148,7 +180,8 @@ class Hashmap {
 #endif
    };
 
-   Hashmap& operator=(const Hashmap<KEY_TYPE, VAL_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher, Meta_Allocator>& other) {
+   // The allocator is not replaced by the assignment operation.
+   Hashmap& operator=(const Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>& other) {
       if (this == &other) {
          return *this;
       }
@@ -162,7 +195,7 @@ class Hashmap {
 
 #ifndef HASHINATOR_CPU_ONLY_MODE
    /** Copy assign but using a provided stream */
-   void overwrite(const Hashmap<KEY_TYPE, VAL_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher, Meta_Allocator>& other, split_gpuStream_t stream = 0) {
+   void overwrite(const Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>& other, split_gpuStream_t stream = 0) {
       if (this == &other) {
          return;
       }
@@ 
-174,11 +207,11 @@ class Hashmap { } #endif - Hashmap& operator=(Hashmap&& other) { + Hashmap& operator=(Hashmap&& other) { if (this == &other) { return *this; } - _metaAllocator.deallocate(_mapInfo, 1); + _allocator.deallocate(reinterpret_cast*>(_mapInfo), get_number_of_Ts_for_Map_Info()); _mapInfo = other._mapInfo; other._mapInfo = nullptr; buckets = std::move(other.buckets); @@ -190,7 +223,7 @@ class Hashmap { ~Hashmap() { deallocate_device_handles(); - _metaAllocator.deallocate(_mapInfo, 1); + _allocator.deallocate(reinterpret_cast*>(_mapInfo), get_number_of_Ts_for_Map_Info()); }; #ifdef HASHINATOR_CPU_ONLY_MODE @@ -237,8 +270,8 @@ class Hashmap { if (newSizePower > 32) { throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets."); } - split::SplitVector> newBuckets( - 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE())); + split::SplitVector,Allocator> newBuckets( + 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE()),_allocator); _mapInfo->sizePower = newSizePower; int bitMask = (1 << _mapInfo->sizePower) - 1; // For efficient modulo of the array size @@ -322,8 +355,8 @@ class Hashmap { // DeviceHasher::reset_all(buckets.data(),_mapInfo, buckets.size(), s); } else { // Need new buckets - buckets = std::move(split::SplitVector>( - 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE()))); + buckets = std::move(split::SplitVector,Allocator>( + 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE()),_allocator)); SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, s)); optimizeGPU(s); } @@ -483,7 +516,7 @@ class Hashmap { #ifdef HASHINATOR_CPU_ONLY_MODE void clear() { - buckets = split::SplitVector>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()}); + buckets = split::SplitVector,Allocator>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()},_allocator); *_mapInfo = MapInfo(_mapInfo->sizePower); return; } @@ -493,7 +526,7 @@ class Hashmap { switch (t) { case targets::host: buckets = - split::SplitVector>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()}); + split::SplitVector,Allocator>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()},_allocator); *_mapInfo = MapInfo(_mapInfo->sizePower); break; @@ -671,11 +704,11 @@ class Hashmap { // Iterator type. Iterates through all non-empty buckets. class iterator { - Hashmap* hashtable; + Hashmap* hashtable; size_t index; public: - iterator(Hashmap& hashtable, size_t index) : hashtable(&hashtable), index(index) {} + iterator(Hashmap& hashtable, size_t index) : hashtable(&hashtable), index(index) {} iterator& operator++() { index++; @@ -706,11 +739,11 @@ class Hashmap { // Const iterator. 
class const_iterator { - const Hashmap* hashtable; + const Hashmap* hashtable; size_t index; public: - explicit const_iterator(const Hashmap& hashtable, size_t index) + explicit const_iterator(const Hashmap& hashtable, size_t index) : hashtable(&hashtable), index(index) {} const_iterator& operator++() { index++; @@ -1152,7 +1185,7 @@ class Hashmap { * hmap.extractPattern(elements,Rule()); * */ template - size_t extractPattern(split::SplitVector>& elements, Rule rule, + size_t extractPattern(split::SplitVector,Allocator>& elements, Rule rule, split_gpuStream_t s = 0) { elements.resize(_mapInfo->fill + 1, true); if constexpr (prefetches) { @@ -1179,18 +1212,18 @@ class Hashmap { const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); split::tools::splitStackArena mPool(memory_for_pool, s); size_t retval = - split::tools::copy_if_raw, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( + split::tools::copy_if_raw,Allocator, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( buckets, elements, rule, nBlocks, mPool, s); return retval; } template - void extractPatternLoop(split::SplitVector>& elements, Rule rule, + void extractPatternLoop(split::SplitVector,Allocator>& elements, Rule rule, split_gpuStream_t s = 0) { // Extract elements matching the Pattern Rule(element)==true; split::tools::copy_if_loop, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( *device_buckets, elements, rule, s); } - void extractLoop(split::SplitVector>& elements, split_gpuStream_t s = 0) { + void extractLoop(split::SplitVector,Allocator>& elements, split_gpuStream_t s = 0) { // Extract all valid elements auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE; @@ -1214,6 +1247,24 @@ class Hashmap { } return elements.size(); } + + template + size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { + elements.resize(_mapInfo->fill + 1, true); + if constexpr (prefetches) { + elements.optimizeGPU(s); + } + // Extract element **keys** matching the Pattern Rule(element)==true; + split::tools::copy_keys_if, KEY_TYPE,Allocator,KeyAlloc, Rule,defaults::MAX_BLOCKSIZE, + defaults::WARPSIZE>(buckets, elements, rule, s); + // FIXME: there is an issue where paging to host occurs and following calls to hashmap operations take a hit. 
+ // temp fix: call optimizeGPU() here + if constexpr (prefetches) { + optimizeGPU(s); + } + return elements.size(); + } + template size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void* stack, size_t max_size, split_gpuStream_t s = 0) { @@ -1226,6 +1277,20 @@ class Hashmap { defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s); return elements.size(); } + + template + size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0) { + elements.resize(_mapInfo->fill + 1, true); + if constexpr (prefetches) { + elements.optimizeGPU(s); + } + // Extract element **keys** matching the Pattern Rule(element)==true; + split::tools::copy_keys_if, KEY_TYPE,Allocator,KeyAlloc, Rule, defaults::MAX_BLOCKSIZE, + defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s); + return elements.size(); + } + template void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { // Extract element **keys** matching the Pattern Rule(element)==true; @@ -1470,11 +1535,11 @@ class Hashmap { class device_iterator { private: size_t index; - Hashmap* hashtable; + Hashmap* hashtable; public: HASHINATOR_DEVICEONLY - device_iterator(Hashmap& hashtable, size_t index) : index(index), hashtable(&hashtable) {} + device_iterator(Hashmap& hashtable, size_t index) : index(index), hashtable(&hashtable) {} HASHINATOR_DEVICEONLY size_t getIndex() { return index; } @@ -1747,27 +1812,22 @@ class Hashmap { } HASHINATOR_DEVICEONLY - const VAL_TYPE& read_element(const KEY_TYPE& key) const { - int bitMask = (1 << _mapInfo->sizePower) - 1; // For efficient modulo of the array size - auto hashIndex = hash(key); - - // Try to find the matching bucket. - const size_t bsize = buckets.size(); - for (size_t i = 0; i < bsize; i++) { - uint32_t vecindex = (hashIndex + i) & bitMask; - const hash_pair& candidate = buckets[vecindex]; - if (candidate.first == key) { - // Found a match, return that - return candidate.second; - } - if (candidate.first == EMPTYBUCKET) { - // Found an empty bucket, so error. 
-            assert(false && "Key does not exist");
-         }
-      }
-      assert(false && "Key does not exist");
-   }
-
+   std::optional<VAL_TYPE> read_element(const KEY_TYPE& key) const {
+      int bitMask = (1 << _mapInfo->sizePower) - 1;
+      auto hashIndex = hash(key);
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
+         uint32_t vecindex = (hashIndex + i) & bitMask;
+         const hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[vecindex];
+         if (candidate.first == key) {
+            return candidate.second;
+         }
+         if (candidate.first == EMPTYBUCKET) {
+            return std::nullopt;
+         }
+      }
+      return std::nullopt;
+   }
 
 #else
 
 // Uses Hasher's insert_kernel to insert all elements
diff --git a/include/splitvector/split_allocators.h b/include/splitvector/split_allocators.h
index 2527400..e905b1b 100644
--- a/include/splitvector/split_allocators.h
+++ b/include/splitvector/split_allocators.h
@@ -85,7 +85,7 @@ class split_unified_allocator {
    pointer address(reference x) const { return &x; }
    const_pointer address(const_reference x) const { return &x; }
 
-   pointer allocate(size_type n, const void* /*hint*/ = 0) {
+   static pointer allocate(size_type n, const void* /*hint*/ = 0) {
       T* ret;
       assert(n && "allocate 0");
       SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n * sizeof(value_type)));
@@ -95,21 +95,7 @@ class split_unified_allocator {
       return ret;
    }
 
-   static void* allocate_raw(size_type n, const void* /*hint*/ = 0) {
-      void* ret;
-      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n));
-      if (ret == nullptr) {
-         throw std::bad_alloc();
-      }
-      return ret;
-   }
-
-   void deallocate(pointer p, size_type n) {
-      if (n != 0 && p != 0) {
-         SPLIT_CHECK_ERR(split_gpuFree(p));
-      }
-   }
-   static void deallocate(void* p, size_type n) {
+   static void deallocate(pointer p, size_type n) {
       if (n != 0 && p != 0) {
          SPLIT_CHECK_ERR(split_gpuFree(p));
       }
@@ -129,74 +115,4 @@ class split_unified_allocator {
 };
 #endif
 
-
-/**
- * @brief Custom allocator for host memory.
- *
- * This class provides an allocator for host memory, which can be accessed
- * by the CPU. It allocates and deallocates memory using malloc and free functions,
- * while also providing constructors and destructors for objects.
- *
- * @tparam T Type of the allocated objects.
- */
-template <typename T>
-class split_host_allocator {
-public:
-   typedef T value_type;
-   typedef value_type* pointer;
-   typedef const value_type* const_pointer;
-   typedef value_type& reference;
-   typedef const value_type& const_reference;
-   typedef ptrdiff_t difference_type;
-   typedef size_t size_type;
-   template <class U>
-   struct rebind {
-      typedef split_host_allocator<U> other;
-   };
-
-   /**
-    * @brief Default constructor.
-    */
-   split_host_allocator() throw() {}
-
-   /**
-    * @brief Copy constructor with different type.
-    */
-   template <class U>
-   split_host_allocator(split_host_allocator<U> const&) throw() {}
-   pointer address(reference x) const { return &x; }
-   const_pointer address(const_reference x) const { return &x; }
-
-   pointer allocate(size_type n, const void* /*hint*/ = 0) {
-      pointer const ret = reinterpret_cast<pointer>(malloc(n * sizeof(value_type)));
-      if (ret == nullptr) {
-         throw std::bad_alloc();
-      }
-      return ret;
-   }
-
-   static void* allocate_raw(size_type n, const void* /*hint*/ = 0) {
-      void* ret = (void*)malloc(n);
-      if (ret == nullptr) {
-         throw std::bad_alloc();
-      }
-      return ret;
-   }
-
-   void deallocate(pointer p, size_type) { free(p); }
-
-   static void deallocate(void* p, size_type) { free(p); }
-
-   size_type max_size() const throw() {
-      size_type max = static_cast<size_type>(-1) / sizeof(value_type);
-      return (max > 0 ? 
max : 1); - } - - template - void construct(U* p, Args&&... args) { - ::new (p) U(std::forward(args)...); - } - - void destroy(pointer p) { p->~value_type(); } -}; } // namespace split diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index 045a624..f01e245 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -101,8 +101,8 @@ __global__ void scan_add(T* input, T* partial_sums, size_t blockSize, size_t len * @param rule The rule functor object. */ template -__global__ void scan_reduce(split::SplitVector>* input, - split::SplitVector>* output, Rule rule) { +__global__ void scan_reduce(split::SplitVector* input, + split::SplitVector* output, Rule rule) { size_t size = input->size(); size_t tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -212,10 +212,10 @@ __global__ void split_prescan(T* input, T* output, T* partial_sums, int n, size_ * @param rule The rule functor object. */ template -__global__ void split_compact(split::SplitVector>* input, - split::SplitVector>* counts, - split::SplitVector>* offsets, - split::SplitVector>* output, Rule rule) { +__global__ void split_compact(split::SplitVector* input, + split::SplitVector* counts, + split::SplitVector* offsets, + split::SplitVector* output, Rule rule) { extern __shared__ uint32_t buffer[]; const size_t size = input->size(); const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -259,10 +259,10 @@ __global__ void split_compact(split::SplitVector -__global__ void split_compact_keys(split::SplitVector>* input, - split::SplitVector>* counts, - split::SplitVector>* offsets, - split::SplitVector>* output, Rule rule) { +__global__ void split_compact_keys(split::SplitVector* input, + split::SplitVector* counts, + split::SplitVector* offsets, + split::SplitVector* output, Rule rule) { extern __shared__ uint32_t buffer[]; const size_t size = input->size(); const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -361,8 +361,8 @@ __global__ void split_compact_keys_raw(T* input, uint32_t* counts, uint32_t* off * @param s The split_gpuStream_t stream for GPU execution (default is 0). 
*/ template -void split_prefix_scan(split::SplitVector>& input, - split::SplitVector>& output, split_gpuStream_t s = 0) +void split_prefix_scan(split::SplitVector& input, + split::SplitVector& output, split_gpuStream_t s = 0) { using vector = split::SplitVector>; @@ -716,8 +716,8 @@ __global__ void block_compact_keys(T* input, U* output, size_t inputSize, Rule r } template -__global__ void loop_compact(split::SplitVector>& inputVec, - split::SplitVector>& outputVec, Rule rule) { +__global__ void loop_compact(split::SplitVector& inputVec, + split::SplitVector& outputVec, Rule rule) { // This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH __shared__ uint32_t warpSums[WARPLENGTH]; __shared__ uint32_t outputCount; @@ -918,7 +918,43 @@ size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, splitStac * @brief Same as copy_if but using raw memory */ template -uint32_t copy_if_raw(split::SplitVector>& input, T* output, Rule rule, +uint32_t copy_if_raw(split::SplitVector& input, T* output, Rule rule, + size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { + + size_t _size = input.size(); + if (_size <= BLOCKSIZE) { + return copy_if_block(input.data(), output, _size, rule, mPool, s); + } + uint32_t* d_counts; + uint32_t* d_offsets; + d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); + + // Phase 1 -- Calculate per warp workload + split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); + d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t), s)); + + // Step 2 -- Exclusive Prefix Scan on offsets + if (nBlocks == 1) { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } else { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } + + // Step 3 -- Compaction + uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); + split::tools::split_compact_raw + <<>>(input.data(), d_counts, d_offsets, + output, rule, _size, nBlocks, retval); + uint32_t numel; + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return numel; +} + +template +uint32_t copy_if_raw(split::SplitVector& input, T* output, Rule rule, size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { size_t _size = input.size(); @@ -994,8 +1030,8 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, size_t nBlocks */ template -void copy_if_loop(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, +void copy_if_loop(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { #ifdef HASHINATOR_DEBUG bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); @@ -1007,8 +1043,8 @@ void copy_if_loop(split::SplitVector>& inpu } template -void copy_if_keys_loop(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, +void copy_if_keys_loop(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { #ifdef HASHINATOR_DEBUG bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); @@ -1023,7 +1059,43 @@ void copy_if_keys_loop(split::SplitVector>& * @brief Same as copy_keys_if but using raw memory */ template -size_t copy_keys_if_raw(split::SplitVector>& input, U* output, Rule rule, +size_t copy_keys_if_raw(split::SplitVector& 
input, U* output, Rule rule, + size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { + + size_t _size = input.size(); + if (_size <= BLOCKSIZE) { + return copy_if_keys_block(input.data(), output, _size, rule, mPool, s); + } + uint32_t* d_counts; + uint32_t* d_offsets; + d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); + + // Phase 1 -- Calculate per warp workload + split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); + d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t), s)); + + // Step 2 -- Exclusive Prefix Scan on offsets + if (nBlocks == 1) { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } else { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } + + // Step 3 -- Compaction + uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); + split::tools::split_compact_keys_raw + <<>>(input.data(), d_counts, d_offsets, + output, rule, _size, nBlocks, retval); + uint32_t numel; + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return numel; +} + +template +size_t copy_keys_if_raw(split::SplitVector& input, U* output, Rule rule, size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { size_t _size = input.size(); @@ -1058,12 +1130,13 @@ size_t copy_keys_if_raw(split::SplitVector> return numel; } + /** * @brief Estimates memory needed for compacting the input splitvector */ template [[nodiscard]] size_t -estimateMemoryForCompaction(const split::SplitVector>& input) noexcept { +estimateMemoryForCompaction(const split::SplitVector& input) noexcept { // Figure out Blocks to use size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); size_t nBlocks = nextPow2(_s); @@ -1092,8 +1165,27 @@ template * @brief Same as copy_if but only for Hashinator keys */ template -void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, + split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + + // Allocate with Mempool + const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); + splitStackArena mPool(memory_for_pool, s); + auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + output.erase(&output[len], output.end()); +} + +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1126,8 +1218,8 @@ void copy_keys_if(split::SplitVector>& inpu * @param s The split_gpuStream_t stream for GPU execution (default is 0). 
*/ template -void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, split_gpuStream_t s = 0) { +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { // Figure out Blocks to use size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); @@ -1143,9 +1235,27 @@ void copy_if(split::SplitVector>& input, output.erase(&output[len], output.end()); } -template -void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, splitStackArena&& mPool, +template +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + + // Allocate with Mempool + const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); + splitStackArena mPool(memory_for_pool, s); + auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + output.erase(&output[len], output.end()); +} + +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, splitStackArena&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1159,8 +1269,8 @@ void copy_keys_if(split::SplitVector>& inpu } template -void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, splitStackArena&& mPool, +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, splitStackArena&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1173,9 +1283,9 @@ void copy_if(split::SplitVector>& input, output.erase(&output[len], output.end()); } -template -void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, void* stack, +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, void* stack, size_t max_size, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1190,9 +1300,27 @@ void copy_keys_if(split::SplitVector>& inpu output.erase(&output[len], output.end()); } -template -void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, void* stack, size_t max_size, +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, void* stack, + size_t max_size, split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + assert(stack && "Invalid stack!"); + splitStackArena mPool(stack, max_size); + auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + output.erase(&output[len], output.end()); +} + + +template +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, void* stack, size_t max_size, split_gpuStream_t s = 0) { // Figure out Blocks to use diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h index fbbb08f..b96c0a1 100644 --- a/include/splitvector/splitvec.h +++ b/include/splitvector/splitvec.h @@ -24,11 +24,13 @@ #pragma once #include "split_allocators.h" #include +#include #include #include #include #include #include +#include #include #ifndef SPLIT_CPU_ONLY_MODE @@ -47,7 +49,7 @@ using DefaultAllocator = split::split_unified_allocator; #define DEVICEONLY #define HOSTDEVICE template -using DefaultAllocator = split::split_host_allocator; +using DefaultAllocator = 
std::allocator; #endif namespace split { @@ -84,14 +86,22 @@ template > class SplitVector { private: + Allocator _allocator; // Allocator used to allocate and deallocate memory; T* _data = nullptr; // actual pointer to our data - size_t* _size; // number of elements in vector. - size_t* _capacity; // number of allocated elements + SplitInfo* _info; // stores size and capacity size_t _alloc_multiplier = 2; // host variable; multiplier for when reserving more space - Allocator _allocator; // Allocator used to allocate and deallocate memory; Residency _location; // Flags that describes the current residency of our data SplitVector* d_vec = nullptr; // device copy pointer + constexpr size_t get_number_of_Ts_for_Split_Info()const noexcept{ + constexpr size_t size_of_T=sizeof(T); + constexpr size_t size_of_info=sizeof(SplitInfo); + if constexpr (size_of_T>size_of_info){ + return 1; + } + return std::ceil(size_of_info/size_of_T); + } + /** * @brief Checks if a pointer is valid and throws an exception if it's null. * @param ptr Pointer to be checked. @@ -113,16 +123,19 @@ class SplitVector { /** * @brief Allocates memory for the vector on the host. - * + * There is a small hack done here to allow us to only use one allocator. + * We allocate memory for _info which is of type SplitInfo using an + * allocator of T and properly casting. * @param size Number of elements to allocate. * @throws std::bad_alloc If memory allocation fails. */ HOSTONLY void _allocate(size_t size) { - _size = _allocate_and_construct(size); - _capacity = _allocate_and_construct(size); - _check_ptr(_size); - _check_ptr(_capacity); - if (size == 0) { + auto n = get_number_of_Ts_for_Split_Info(); + _info = reinterpret_cast(_allocator.allocate(n)); + _check_ptr(_info); + _info->size=size; + _info->capacity=size; + if (size==0){ return; } _data = _allocate_and_construct(size, T()); @@ -141,8 +154,8 @@ class SplitVector { _deallocate_and_destroy(capacity(), _data); _data = nullptr; } - _deallocate_and_destroy(_capacity); - _deallocate_and_destroy(_size); + _allocator.deallocate(reinterpret_cast(_info),get_number_of_Ts_for_Split_Info()); + _info=nullptr; } /** @@ -154,25 +167,14 @@ class SplitVector { */ HOSTONLY T* _allocate_and_construct(size_t n, const T& val) { T* _ptr = _allocator.allocate(n); - for (size_t i = 0; i < n; i++) { - _allocator.construct(&_ptr[i], val); + if constexpr (!std::is_trivially_copy_constructible_v ){ + for (size_t i = 0; i < n; i++) { + _allocator.construct(&_ptr[i], val); + } } return _ptr; } - /** - * @brief Allocates memory and constructs metadata on the host. - * - * @param val Value to be used for construction. - * @return Pointer to the allocated and constructed memory. - */ - HOSTONLY size_t* _allocate_and_construct(const size_t& val) { - size_t* _ptr = (size_t*)_allocator.allocate_raw(sizeof(size_t)); - assert(_ptr); - *_ptr = val; - return _ptr; - } - /** * @brief Deallocates memory and destroys elements on the host. * @@ -180,26 +182,17 @@ class SplitVector { * @param _ptr Pointer to the memory to be deallocated and destroyed. */ HOSTONLY void _deallocate_and_destroy(size_t n, T* _ptr) { - for (size_t i = 0; i < n; i++) { - _allocator.destroy(&_ptr[i]); + if constexpr (!std::is_trivially_copy_constructible_v){ + for (size_t i = 0; i < n; i++) { + _allocator.destroy(&_ptr[i]); + } } _allocator.deallocate(_ptr, n); } - /** - * @brief Deallocates memory for metadata on the host. - * - * @param ptr Pointer to the memory to be deallocated. 
- */ - HOSTONLY void _deallocate_and_destroy(size_t* ptr) { - if (ptr) { - _allocator.deallocate(ptr, 1); - } - } - public: /* Available Constructors : - * -- SplitVector() --> Default constructor. Almost a no OP but _size and _capacity have to + * -- SplitVector() --> Default constructor. Almost a no OP but info->_size and info->_capacity have to * be allocated for device usage. * -- SplitVector(size_t) --> Instantiates a splitvector with a specific size. (capacity == size) * -- SplitVector(size_t,T) --> Instantiates a splitvector with a specific size and sets all @@ -213,16 +206,23 @@ class SplitVector { /** * @brief Default constructor. Creates an empty SplitVector. */ - HOSTONLY explicit SplitVector() : _location(Residency::host), d_vec(nullptr) { + HOSTONLY explicit SplitVector() :_allocator(Allocator()), _location(Residency::host), d_vec(nullptr) { this->_allocate(0); // seems counter-intuitive based on stl but it is not! } + + HOSTONLY explicit SplitVector(const Allocator& alloc) :_allocator(alloc), _location(Residency::host), d_vec(nullptr) { + this->_allocate(0); + } /** * @brief Constructor to create a SplitVector of a specified size. * * @param size The size of the SplitVector to be created. */ - HOSTONLY explicit SplitVector(size_t size) : _location(Residency::host), d_vec(nullptr) { this->_allocate(size); } + HOSTONLY explicit SplitVector(size_t size) : _allocator(Allocator()),_location(Residency::host), d_vec(nullptr) { this->_allocate(size); } + + HOSTONLY explicit SplitVector(size_t size,const Allocator& alloc) : _allocator(alloc),_location(Residency::host), d_vec(nullptr) { this->_allocate(size); } + /** * @brief Constructor to create a SplitVector of a specified size with initial values. @@ -230,7 +230,14 @@ class SplitVector { * @param size The size of the SplitVector to be created. * @param val The initial value to be assigned to each element. */ - HOSTONLY explicit SplitVector(size_t size, const T& val) : _location(Residency::host), d_vec(nullptr) { + HOSTONLY explicit SplitVector(size_t size, const T& val) :_allocator(Allocator()), _location(Residency::host), d_vec(nullptr) { + this->_allocate(size); + for (size_t i = 0; i < size; i++) { + _data[i] = val; + } + } + + HOSTONLY explicit SplitVector(size_t size, const T& val,const Allocator& alloc) :_allocator(alloc), _location(Residency::host), d_vec(nullptr) { this->_allocate(size); for (size_t i = 0; i < size; i++) { _data[i] = val; @@ -243,7 +250,7 @@ class SplitVector { * @param other The SplitVector to be copied. */ #ifdef SPLIT_CPU_ONLY_MODE - HOSTONLY explicit SplitVector(const SplitVector& other) { + HOSTONLY explicit SplitVector(const SplitVector& other):_allocator(other._allocator) { const size_t size_to_allocate = other.size(); this->_allocate(size_to_allocate); for (size_t i = 0; i < size_to_allocate; i++) { @@ -252,7 +259,7 @@ class SplitVector { } #else - HOSTONLY explicit SplitVector(const SplitVector& other) { + HOSTONLY explicit SplitVector(const SplitVector& other):_allocator(other._allocator) { const size_t size_to_allocate = other.size(); auto copySafe = [&]() -> void { for (size_t i = 0; i < size_to_allocate; i++) { @@ -280,11 +287,12 @@ class SplitVector { * @param other The SplitVector to be moved from. 
    */
   HOSTONLY SplitVector(SplitVector&& other) noexcept {
+      _allocator = other._allocator;
       _data = other._data;
-      *_size = other.size();
-      *_capacity = other.capacity();
-      *(other._capacity) = 0;
-      *(other._size) = 0;
+      // Steal the metadata block: our own _info is not yet allocated here, so
+      // writing through it would dereference an uninitialized pointer.
+      _info = other._info;
+      other._info = nullptr;
       other._data = nullptr;
       _location = other._location;
       d_vec = nullptr;
@@ -295,7 +303,7 @@ class SplitVector {
    *
    * @param init_list The initializer list to initialize the SplitVector with.
    */
-   HOSTONLY explicit SplitVector(std::initializer_list<T> init_list) : _location(Residency::host), d_vec(nullptr) {
+   HOSTONLY explicit SplitVector(std::initializer_list<T> init_list) : _allocator(Allocator()), _location(Residency::host), d_vec(nullptr) {
      this->_allocate(init_list.size());
      for (size_t i = 0; i < size(); i++) {
         _data[i] = init_list.begin()[i];
@@ -307,7 +315,7 @@ class SplitVector {
    *
    * @param other The std::vector to initialize the SplitVector with.
    */
-   HOSTONLY explicit SplitVector(const std::vector<T>& other) : _location(Residency::host), d_vec(nullptr) {
+   HOSTONLY explicit SplitVector(const std::vector<T>& other) : _allocator(Allocator()), _location(Residency::host), d_vec(nullptr) {
      this->_allocate(other.size());
      for (size_t i = 0; i < size(); i++) {
         _data[i] = other[i];
@@ -395,8 +403,7 @@ class SplitVector {
      }
      SPLIT_CHECK_ERR(
          split_gpuMemcpyAsync(_data, other._data, size() * sizeof(T), split_gpuMemcpyDeviceToDevice, stream));
-      SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), device, stream));
-      SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), device, stream));
+      SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), device, stream));
      return;
      }
   }
@@ -421,16 +428,17 @@ class SplitVector {
      _deallocate_and_destroy(capacity(), _data);
      _data = other._data;
-      *_size = other.size();
-      *_capacity = other.capacity();
-      *(other._capacity) = 0;
-      *(other._size) = 0;
+      _info->size = other.size();
+      _info->capacity = other.capacity();
+      other._info->capacity = 0;
+      other._info->size = 0;
      other._data = nullptr;
      _location = other._location;
      d_vec = nullptr;
      return *this;
   }
 
+#ifndef SPLIT_CPU_ONLY_MODE
   /**
    * @brief Custom new operator for allocation using the allocator.
    *
    * @param len The size of the allocation.
    * @return Pointer to the allocated memory.
    */
   HOSTONLY
-   void* operator new(size_t len) {
-      void* ptr = Allocator::allocate_raw(len);
-      return ptr;
+   void* operator new(size_t size) {
+      void* ret;
+      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, size));
+      if (ret == nullptr) {
+         throw std::bad_alloc();
+      }
+      return ret;
   }
 
   /**
    * @brief Custom delete operator for deallocation using the allocator.
    *
    * @param ptr Pointer to the memory to deallocate.
    */
   HOSTONLY
-   void operator delete(void* ptr) { Allocator::deallocate(ptr, 1); }
+   void operator delete(void* ptr) {
+      SPLIT_CHECK_ERR(split_gpuFree(ptr));
+   }
 
   /**
    * @brief Custom new operator for array allocation using the allocator.
    *
    * @param len The size of the allocation.
    * @return Pointer to the allocated memory.
    */
   HOSTONLY
-   void* operator new[](size_t len) {
-      void* ptr = Allocator::allocate_raw(len);
-      return ptr;
+   void* operator new[](size_t size) {
+      void* ret;
+      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, size));
+      if (ret == nullptr) {
+         throw std::bad_alloc();
+      }
+      return ret;
   }
 
   /**
    * @brief Custom delete operator for array deallocation using the allocator.
    *
    * @param ptr Pointer to the memory to deallocate.
*/ HOSTONLY - void operator delete[](void* ptr) { Allocator::deallocate(ptr); } - -#ifndef SPLIT_CPU_ONLY_MODE + void operator delete[](void* ptr) { SPLIT_CHECK_ERR(split_gpuFree(ptr));} + /** * @brief Uploads the SplitVector to the GPU. * @@ -507,18 +523,17 @@ class SplitVector { int device; SPLIT_CHECK_ERR(split_gpuGetDevice(&device)); - // First make sure _capacity does not page-fault ie prefetch it to host - // This is done because _capacity would page-fault otherwise as pointed by Markus - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); + // First make sure info->_capacity does not page-fault ie prefetch it to host + // This is done because info->_capacity would page-fault otherwise as pointed by Markus + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity == 0) { + if (_info->capacity == 0) { return; } // Now prefetch everything to device SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_data, capacity() * sizeof(T), device, stream)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), device, stream)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), device, stream)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), device, stream)); } /** @@ -528,10 +543,9 @@ class SplitVector { */ HOSTONLY void optimizeCPU(split_gpuStream_t stream = 0) noexcept { _location = Residency::host; - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), split_gpuCpuDeviceId, stream)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity == 0) { + if (_info->capacity == 0) { return; } SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_data, capacity() * sizeof(T), split_gpuCpuDeviceId, stream)); @@ -544,9 +558,8 @@ class SplitVector { * @param flags Flags for memory attachment. */ HOSTONLY void streamAttach(split_gpuStream_t s, uint32_t flags = split_gpuMemAttachSingle) { - SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_size, sizeof(size_t), flags)); - SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_capacity, sizeof(size_t), flags)); - SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_data, *_capacity * sizeof(T), flags)); + SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_info, sizeof(SplitInfo), flags)); + SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_data, _info->capacity * sizeof(T), flags)); return; } @@ -566,8 +579,7 @@ class SplitVector { * @param s The GPU stream to perform the copy on. 
*/ HOSTONLY void copyMetadata(SplitInfo* dst, split_gpuStream_t s = 0) { - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&dst->size, _size, sizeof(size_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&dst->capacity, _capacity, sizeof(size_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&dst->size, _info, sizeof(SplitInfo), split_gpuMemcpyDeviceToHost, s)); } /** @@ -581,12 +593,10 @@ class SplitVector { if (device == -1) { SPLIT_CHECK_ERR(split_gpuGetDevice(&device)); } - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); SPLIT_CHECK_ERR(split_gpuMemAdvise(_data, capacity() * sizeof(T), advice, device)); - SPLIT_CHECK_ERR(split_gpuMemAdvise(_size, sizeof(size_t), advice, device)); - SPLIT_CHECK_ERR(split_gpuMemAdvise(_capacity, sizeof(size_t), advice, device)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), device, stream)); + SPLIT_CHECK_ERR(split_gpuMemAdvise(_info, sizeof(SplitInfo), advice, device)); } #endif @@ -602,8 +612,7 @@ class SplitVector { return; } split::swap(_data, other._data); - split::swap(_size, other._size); - split::swap(_capacity, other._capacity); + split::swap(_info, other._info); split::swap(_allocator, other._allocator); return; } @@ -614,7 +623,7 @@ class SplitVector { * * @return Number of elements in the container. */ - HOSTDEVICE const size_t& size() const noexcept { return *_size; } + HOSTDEVICE size_t size() const noexcept { return _info->size; } /** * @brief Bracket accessor for accessing elements by index without bounds check. @@ -680,8 +689,8 @@ class SplitVector { _deallocate_and_destroy(capacity(), _data); } _data = nullptr; - *_capacity = 0; - *_size = 0; + _info->capacity = 0; + _info->size = 0; return; } T* _new_data; @@ -703,7 +712,7 @@ class SplitVector { // Swap pointers & update capacity // Size remains the same ofc _data = _new_data; - *_capacity = requested_space; + _info->capacity = requested_space; return; } @@ -718,18 +727,20 @@ class SplitVector { * will be invalidated after a call. */ void reserve(size_t requested_space, bool eco = false) { - size_t current_space = *_capacity; + size_t current_space = _info->capacity; // Vector was default initialized if (_data == nullptr) { _deallocate(); _allocate(requested_space); - *_size = 0; + _info->size = 0; return; } // Nope. if (requested_space <= current_space) { - for (size_t i = size(); i < requested_space; ++i) { - _allocator.construct(&_data[i], T()); + if constexpr (!std::is_trivial::value) { + for (size_t i = size(); i < requested_space; ++i) { + _allocator.construct(&_data[i], T()); + } } return; } @@ -756,11 +767,11 @@ class SplitVector { void resize(size_t newSize, bool eco = false) { // Let's reserve some space and change our size if (newSize <= size()) { - *_size = newSize; + _info->size = newSize; return; } reserve(newSize, eco); - *_size = newSize; + _info->size = newSize; // TODO: should it set entries to zero? } @@ -773,8 +784,8 @@ class SplitVector { * @brief Reduce the capacity of the SplitVector to match its size. 
*/ void shrink_to_fit() { - size_t curr_cap = *_capacity; - size_t curr_size = *_size; + size_t curr_cap = _info->capacity; + size_t curr_size = _info->size; if (curr_cap == curr_size) { return; @@ -794,8 +805,8 @@ class SplitVector { HOSTONLY void reallocate(size_t requested_space, split_gpuStream_t stream = 0) { // Store addresses - const size_t __size = *_size; - const size_t __old_capacity = *_capacity; + const size_t __size = _info->size; + const size_t __old_capacity = _info->capacity; T* __old_data = _data; // Verify allocation sufficiency if (__size > requested_space) { @@ -809,8 +820,8 @@ class SplitVector { _deallocate_and_destroy(__old_capacity, __old_data); } _data = nullptr; - *_capacity = 0; - *_size = 0; + _info->capacity = 0; + _info->size = 0; return; } T* _new_data; @@ -823,7 +834,7 @@ class SplitVector { T* __new_data = _new_data; // Swap pointers & update capacity _data = _new_data; - *_capacity = requested_space; + _info->capacity = requested_space; // Perform copy on device if (__size > 0) { SPLIT_CHECK_ERR( @@ -852,11 +863,11 @@ class SplitVector { if (_data == nullptr) { _deallocate(); _allocate(requested_space); - *_size = 0; + _info->size = 0; return; } // Already has sufficient capacity? - const size_t current_space = *_capacity; + const size_t current_space = _info->capacity; if (requested_space <= current_space) { return; } @@ -883,13 +894,19 @@ class SplitVector { */ HOSTONLY void resize(size_t newSize, bool eco = false, split_gpuStream_t stream = 0) { + if (_data==nullptr){ + _data=_allocator.allocate(newSize); + _info->size = newSize; + _info->capacity = newSize; + return; + } // Let's reserve some space and change our size if (newSize <= size()) { - *_size = newSize; + _info->size = newSize; return; } reserve(newSize, eco, stream); - *_size = newSize; + _info->size = newSize; // TODO: should it set entries to zero? } @@ -899,16 +916,16 @@ class SplitVector { * @param newSize The new size of the SplitVector. */ DEVICEONLY - void device_resize(size_t newSize, bool construct = true) { + void device_resize(size_t newSize,bool _construct=true) { if (newSize > capacity()) { assert(0 && "Splitvector has a catastrophic failure trying to resize on device."); } - if (construct) { + if constexpr (!std::is_trivially_copy_constructible_v ){ for (size_t i = size(); i < newSize; ++i) { _allocator.construct(&_data[i], T()); } } - *_size = newSize; + _info->size = newSize; } /** @@ -922,8 +939,8 @@ class SplitVector { */ HOSTONLY void shrink_to_fit(split_gpuStream_t stream = 0) { - size_t curr_cap = *_capacity; - size_t curr_size = *_size; + size_t curr_cap = _info->capacity; + size_t curr_size = _info->size; if (curr_cap == curr_size) { return; @@ -958,7 +975,7 @@ class SplitVector { (_data + --i)->~T(); } } - *_size = end; + _info->size = end; } /** @@ -971,7 +988,7 @@ class SplitVector { _data[i].~T(); } } - *_size = 0; + _info->size = 0; return; } @@ -981,7 +998,7 @@ class SplitVector { * @return The capacity of the SplitVector. */ HOSTDEVICE - inline size_t capacity() const noexcept { return *_capacity; } + inline size_t capacity() const noexcept { return _info->capacity; } /** * @brief Get a reference to the last element of the SplitVector. @@ -989,7 +1006,7 @@ class SplitVector { * @return Reference to the last element. */ HOSTDEVICE - T& back() noexcept { return _data[*_size - 1]; } + T& back() noexcept { return _data[_info->size - 1]; } /** * @brief Get a const reference to the last element of the SplitVector. 
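Throughout these hunks the separate `_size`/`_capacity` pointers are collapsed into the single `_info` block. For reference, a minimal sketch of the layout this patch assumes (the authoritative `SplitInfo` definition lives in the splitvector headers; `slots_needed_for_info` is a hypothetical stand-in for `get_number_of_Ts_for_Split_Info()`):

```c++
// Sketch only: assumed shape of the metadata block used by SplitVector.
struct SplitInfo {
   size_t size;     // number of live elements
   size_t capacity; // number of allocated elements
};

// _info is carved out of storage obtained from the element allocator, so the
// request is expressed in units of sizeof(T) and rounded up (ceil division).
template <typename T>
constexpr size_t slots_needed_for_info() noexcept {
   return (sizeof(SplitInfo) + sizeof(T) - 1) / sizeof(T);
}
```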
@@ -997,7 +1014,7 @@ class SplitVector {
    * @return Const reference to the last element.
    */
   HOSTDEVICE
-   const T& back() const noexcept { return _data[*_size - 1]; }
+   const T& back() const noexcept { return _data[_info->size - 1]; }
 
   /**
    * @brief Get a reference to the first element of the SplitVector.
@@ -1030,16 +1047,17 @@ class SplitVector {
    */
   HOSTONLY
   void push_back(const T& val) {
-      // If we have no allocated memory because the default ctor was used then
-      // allocate one element, set it and return
-      if (_data == nullptr) {
-         *this = SplitVector(1, val);
-         return;
-      }
      resize(size() + 1);
      _data[size() - 1] = val;
      return;
   }
+
+   // Unsafe fast path: the caller must guarantee capacity() > size().
+   HOSTONLY
+   void push_back_unsafe(const T& val) {
+      _data[size()] = val;
+      _info->size++;
+      return;
+   }
 
   /**
    * @brief Push a moved element to the back of the SplitVector.
@@ -1051,10 +1069,11 @@ class SplitVector {
 
      // If we have no allocated memory because the default ctor was used then
      // allocate one element, set it and return
-      if (_data == nullptr) {
-         *this = SplitVector(1, std::move(val));
-         return;
-      }
+      // if (_data == nullptr) {
+      //    _allocate(1);
+      //    _data[size() - 1] = val;
+      //    return;
+      // }
      resize(size() + 1);
      _data[size() - 1] = std::move(val);
      return;
@@ -1068,9 +1087,9 @@ class SplitVector {
    */
   DEVICEONLY
   bool device_push_back(const T& val) {
-      size_t old = atomicAdd((unsigned int*)_size, 1);
+      size_t old = atomicAdd((unsigned int*)(&_info->size), 1);
      if (old >= capacity() - 1) {
-         atomicSub((unsigned int*)_size, 1);
+         atomicSub((unsigned int*)&_info->size, 1);
         return false;
      }
      atomicCAS(&(_data[old]), _data[old], val);
@@ -1087,9 +1106,9 @@ class SplitVector {
 
      // We need at least capacity=size+1 otherwise this
      // pushback cannot be done
-      size_t old = atomicAdd((unsigned int*)_size, 1);
+      size_t old = atomicAdd((unsigned int*)&_info->size, 1);
      if (old >= capacity() - 1) {
-         atomicSub((unsigned int*)_size, 1);
+         atomicSub((unsigned int*)&_info->size, 1);
         return false;
      }
      atomicCAS(&(_data[old]), _data[old], std::move(val));
@@ -1286,7 +1305,7 @@ class SplitVector {
         _data[i + 1] = _data[i];
      }
      _data[index] = val;
-      *_size = *_size + 1;
+      _info->size = _info->size + 1;
      return iterator(_data + index);
   }
@@ -1483,7 +1502,7 @@ class SplitVector {
            new (&_data[i]) T(_data[i + 1]);
         }
      }
-      *_size -= 1;
+      _info->size -= 1;
      iterator retval = &_data[index];
      return retval;
   }
@@ -1515,7 +1534,7 @@ class SplitVector {
            new (&_data[i]) T(_data[i + range]);
         }
      }
-      *_size -= end - start;
+      _info->size -= end - start;
      iterator it = &_data[start];
      return it;
   }
@@ -1540,8 +1559,13 @@ class SplitVector {
      resize(size() + 1);
      iterator it = &_data[index];
      std::move(it.data(), end().data(), it.data() + 1);
-      _allocator.destroy(it.data());
-      _allocator.construct(it.data(), args...);
+      if constexpr (!std::is_trivial<T>::value) {
+         _allocator.destroy(it.data());
+         _allocator.construct(it.data(), args...);
+      } else {
+         // Trivial types: just forward the arguments and assign.
+         *it = T(std::forward<Args>(args)...);
+      }
      return it;
   }
diff --git a/meson.build b/meson.build
index 5c10eb6..f79df0d 100644
--- a/meson.build
+++ b/meson.build
@@ -5,8 +5,9 @@ add_global_arguments('-DHASHMAPDEBUG', language : 'cpp')
 add_global_arguments('-DHASHMAPDEBUG', language : 'cuda')
 add_project_arguments(['--expt-relaxed-constexpr','--expt-extended-lambda' ], language: 'cuda')
 
 #Dependencies
 gtest_dep = dependency('gtest', fallback : ['gtest', 'gtest_dep'])
+umpire_dep = dependency('umpire', required : false)
 
 #Limit register usage in debug builds
 if ( get_option('buildtype') == 
'debug') @@ -31,8 +32,16 @@ tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependenc realisticTest = executable('realistic', 'unit_tests/benchmark/realistic.cu', dependencies :gtest_dep) hybridGPU = executable('hybrid_gpu', 'unit_tests/hybrid/main.cu',dependencies :gtest_dep ) +if umpire_dep.found() + hashinator_umpire_unit = executable('hashmap_umpire_test', 'unit_tests/umpire_tests/hashinator.cu',dependencies :[umpire_dep,gtest_dep]) + splitvector_host_umpire_unit = executable('splitvector_host_umpire_test', 'unit_tests/umpire_tests/vec_host.cu',dependencies :[umpire_dep,gtest_dep] ) + splitvector_device_umpire_unit = executable('splitvector_device_umpire_test', 'unit_tests/umpire_tests/vec_device.cu',dependencies :[umpire_dep,gtest_dep]) + test('HashinatorTestUmpire', hashinator_umpire_unit) + test('SplitVectorUmpireHost', splitvector_host_umpire_unit) + test('SplitVectorUmpireDevice', splitvector_device_umpire_unit) +endif -#Test-Runner +#Test-Runners test('HashinatorTest', hashinator_unit) test('SplitVectorDeviceTest', splitvector_device_unit) test('SplitVectorHostTest', splitvector_host_unit) diff --git a/unit_tests/gtest_vec_host/vec_test.cu b/unit_tests/gtest_vec_host/vec_test.cu index efd65a6..a461ee4 100644 --- a/unit_tests/gtest_vec_host/vec_test.cu +++ b/unit_tests/gtest_vec_host/vec_test.cu @@ -13,10 +13,10 @@ #define expect_eq EXPECT_EQ #define N 1<<12 -typedef split::SplitVector vec ; +typedef split::SplitVector> vec ; typedef std::vector stdvec ; -typedef split::SplitVector> vec2d ; -typedef split::SplitVector::iterator split_iterator; +typedef split::SplitVector>,std::allocator>>> vec2d ; +typedef split::SplitVector>::iterator split_iterator; @@ -228,7 +228,7 @@ TEST(Vector_Functionality , Push_Back_2){ TEST(Vector_Functionality , Insert_1_Element){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto s0=a.size(); auto c0=a.capacity(); auto it(a.begin()); auto it2=a.insert(it,-1); @@ -282,7 +282,7 @@ TEST(Vector_Functionality , Insert_1_Element){ TEST(Vector_Functionality , Insert_Many_Elements){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto s0=a.size(); auto c0=a.capacity(); auto it(a.begin()); auto it2=a.insert(it,10,-1); @@ -330,9 +330,9 @@ TEST(Vector_Functionality , Insert_Many_Elements){ TEST(Vector_Functionality , Insert_Range_Based){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto backup(a); - split::SplitVector b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; + vec b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; auto s0=a.size(); auto it(a.end()); auto it_b0(b.begin()); @@ -350,9 +350,9 @@ TEST(Vector_Functionality , Insert_Range_Based){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto backup(a); - split::SplitVector b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; + vec b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; auto s0=a.size(); auto it(a.end()); auto it_b0(b.begin()); @@ -369,16 +369,16 @@ TEST(Vector_Functionality , Insert_Range_Based){ TEST(Vector_Functionality , Erase_Single){ - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; - split::SplitVector b{1,2,3,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; + vec b{1,2,3,5,6,7,8,9,10}; split_iterator it0=&a[3]; a.erase(it0); expect_true(a==b); } TEST(Vector_Functionality , Erase_Range){ - split::SplitVector a{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}; - split::SplitVector b{0,1,7,8,9,10,11,12,13,14}; + vec a{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}; + vec b{0,1,7,8,9,10,11,12,13,14}; split_iterator it0=&a[2]; 
 split_iterator it1=&a[7];
 a.erase(it0,it1);
 expect_true(a==b);
}
diff --git a/unit_tests/hashmap_unit_test/main.cu b/unit_tests/hashmap_unit_test/main.cu
index 7d6f3d8..2da028a 100644
--- a/unit_tests/hashmap_unit_test/main.cu
+++ b/unit_tests/hashmap_unit_test/main.cu
@@ -1084,16 +1084,7 @@ TEST(HashmapUnitTets ,Test_Duplicate_Insertion){
    }
 }
 
-TEST(HashmapUnitTets ,EMPTY_TOMBSTONE_values){
-   Hashmap<key_type,val_type> hmap;
-   const auto tombstone = hmap.get_tombstone();
-   const auto emptybucket = hmap.get_emptybucket();
-   expect_true(tombstone==emptybucket);
-   //These can be also checked during compile time
-   static_assert(tombstone==emptybucket);
-}
-
 int main(int argc, char* argv[]){
    srand(time(NULL));
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
diff --git a/unit_tests/umpire_tests/hashinator.cu b/unit_tests/umpire_tests/hashinator.cu
new file mode 100644
index 0000000..ac5e396
--- /dev/null
+++ b/unit_tests/umpire_tests/hashinator.cu
@@ -0,0 +1,996 @@
+#include <gtest/gtest.h>
+#include <iostream>
+#include <chrono>
+#include <random>
+#include "../../include/hashinator/hashinator.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <limits>
+#include "include/hashinator/hash_pair.h"
+#include "umpire/Allocator.hpp"
+#include "umpire/ResourceManager.hpp"
+#include "umpire/TypedAllocator.hpp"
+
+#define BLOCKSIZE 32
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+constexpr int MINPOWER = 10;
+constexpr int MAXPOWER = 11;
+
+using namespace std::chrono;
+using namespace Hashinator;
+typedef uint32_t val_type;
+typedef uint32_t key_type;
+
+static umpire::TypedAllocator<key_type>* vector_key_alloc;
+static umpire::TypedAllocator<hash_pair<key_type,val_type>>* vector_map_alloc;
+typedef split::SplitVector<hash_pair<key_type,val_type>, umpire::TypedAllocator<hash_pair<key_type,val_type>>> vector;
+typedef split::SplitVector<key_type, umpire::TypedAllocator<key_type>> ivector;
+
+// typedef split::SplitVector<hash_pair<key_type,val_type>> vector;
+// typedef split::SplitVector<key_type> ivector;
+typedef Hashmap<key_type, val_type, umpire::TypedAllocator<hash_pair<key_type,val_type>>> hashmap;
+
+struct Predicate{
+   HASHINATOR_HOSTDEVICE
+   inline bool operator()( hash_pair<key_type,val_type>& element)const{
+      return element.second%2==0;
+   }
+};
+
+template <class Fn, class ... Args>
+auto execute_and_time(const char* name,Fn fn, Args && ... 
args) ->bool{ + std::chrono::time_point start,stop; + double total_time=0; + start = std::chrono::high_resolution_clock::now(); + bool retval=fn(args...); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + total_time+=duration; + //std::cout<& kval=src.at(i); + kval.first=i + bias; + kval.second=i; + } +} + + +void cpu_write(hashmap& hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + hmap.at(kval.first)=kval.second; + } +} + +__global__ +void gpu_write(hashmap* hmap, hash_pair*src, size_t N){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N ){ + hmap->set_element(src[index].first, src[index].second); + } +} + + +__global__ +void gpu_remove_insert(hashmap* hmap, hash_pair*rm, hash_pair*add, size_t N){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index ==0 ){ + for ( int i =0; i elem=rm[i]; + auto rmval=hmap->read_element(elem.first); + hmap->device_erase(elem.first); + } + for ( int i =0; i elem=add[i]; + hmap->set_element(elem.first,elem.second); + } + } +} + + +__global__ +void gpu_delete_even(hashmap* hmap, hash_pair*src,size_t N){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (indexdevice_find(src[index].first); + if (kpos==hmap->device_end()){assert(0 && "Catastrophic crash in deletion");} + if (kpos->second %2==0 ){ + int retval=hmap->device_erase(kpos->first); + assert(retval==1 && "Failed to erase!"); + retval=hmap->device_erase(kpos->first); + assert(retval==0 && "Failed to not erase!"); + + } + } + return; +} + + + +__global__ +void gpu_recover_odd_elements(hashmap* hmap,hash_pair* src,size_t N ){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N ){ + key_type key= src[index].first; + val_type val= src[index].second; + if (val%2!=0){ + auto it=hmap->device_find(key); + if (it==hmap->device_end()){ + assert( 0 && "Failed in GPU RECOVER ALL "); + } + if (it->first!=key || it->second!=val){ + assert( 0 && "Failed in GPU RECOVER ALL "); + } + } + } + + //Iterate over all elements with 1 thread and check for evens; + if (index==0){ + for (auto it=hmap->device_begin(); it!=hmap->device_end(); ++it){ + if (it->second%2==0 ){ + printf("Found even when there should not be any!\n"); + assert(0); + } + } + } + return; +} + +bool recover_odd_elements(const hashmap& hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + if (kval.second%2!=0){ + auto retval=hmap.find(kval.first); + if (retval==hmap.end()){return false;} + bool sane=retval->first==kval.first && retval->second== kval.second ; + if (!sane){ + return false; + } + } + } + return true; +} + +bool recover_all_elements(const hashmap& hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + //std::cout<<"Validating "<first<<" "<second<& kval=src.at(i); + if (kval.second%2!=0){ + auto retval=hmap->find(kval.first); + if (retval==hmap->end()){return false;} + bool sane=retval->first==kval.first && retval->second== kval.second ; + if (!sane){ + return false; + } + } + } + return true; +} + +bool recover_all_elements(hashmap* hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + auto retval=hmap->find(kval.first); + if (retval==hmap->end()){return false;} + bool sane=retval->first==kval.first && retval->second== kval.second ; + if (!sane){ + return false; + } + } + return true; +} + +__global__ +void gpu_recover_all_elements(hashmap* hmap,hash_pair* src,size_t N ){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N ){ + key_type key= src[index].first; + val_type 
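The execute_and_time helper earlier in this file lost its chrono template arguments in rendering; a minimal self-contained sketch of the same harness (microsecond resolution is an assumption) is:

```c++
// Minimal sketch of the timing harness: run fn(args...), measure wall time,
// forward the boolean verdict. Units and printing behaviour are assumptions.
#include <chrono>
#include <utility>

template <typename Fn, typename... Args>
auto execute_and_time(const char* name, Fn fn, Args&&... args) -> bool {
   const auto start  = std::chrono::high_resolution_clock::now();
   const bool retval = fn(std::forward<Args>(args)...);
   const auto stop   = std::chrono::high_resolution_clock::now();
   const auto total_time =
       std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
   (void)name;
   (void)total_time; // the test file only prints these in a commented-out path
   return retval;
}
```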
val= src[index].second; + auto it=hmap->device_find(key); + if (it==hmap->device_end()){ + printf("END FOUND DEVICE\n"); + assert( 0 && "Failed in GPU RECOVER ALL "); + } + if (it->first!=key || it->second!=val){ + assert( 0 && "Failed in GPU RECOVER ALL "); + } + } + return; +} + +__global__ +void gpu_recover_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type retval;; + val_type val= src[wid].second; + hmap->warpFind(key,retval,w_tid); + assert(retval==val); + } +} + +__global__ +void gpu_recover_non_existant_key_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + val_type retval=42;; + key_type key=42 ; + hmap->warpFind(key,retval,w_tid); + assert(retval==42); + } +} + +__global__ +void gpu_write_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type val= src[wid].second; + hmap->warpInsert(key,val,w_tid); + } +} + +__global__ +void gpu_write_warpWide_UnorderedSet(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type val= src[wid].second; + hmap->warpInsert<1>(key,val,w_tid); + } +} + +__global__ +void gpu_write_warpWide_Duplicate(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[0].first; + val_type val= src[0].second; + hmap->warpInsert(key,val,w_tid); + } +} + +__global__ +void gpu_erase_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + hmap->warpErase(key,w_tid); + } +} + +__global__ +void gpu_write_warpWide_V(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type val= src[wid].second; + auto retval=hmap->warpInsert_V(key,val,w_tid); + assert(retval); + } +} + +bool testWarpInsert(int power){ + size_t N = 1<resize(power+1); + + //Upload to device and insert input + gpu_write_warpWide<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Verify all elements + cpuOK=recover_all_elements(*hmap,src); + if (!cpuOK){ + return false; + } + + //duplicate test + { + size_t N = 1<resize(power+1); + //Upload to device and insert input + gpu_write_warpWide_Duplicate<<<1,1024>>>(hmap,src.data(),1); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + if (hmap->size()!=1){ + return false; + } + } + 
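All of the warp-wide kernels above share one addressing convention: a warp, not a thread, owns each element, so every lane computes the same warp id and passes its lane id to the warp-cooperative call. Condensed into one illustrative kernel (the kernel name is not from the diff):

```c++
// Illustrative warp-cooperative lookup, following the kernels above.
// Assumes the hashmap/key_type/val_type aliases of this test file.
#include <cassert>

__global__ void warp_find_sketch(hashmap* hmap,
                                 Hashinator::hash_pair<key_type, val_type>* src,
                                 size_t N) {
   const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t wid   = index / Hashinator::defaults::WARPSIZE; // one warp per key
   const size_t w_tid = index % Hashinator::defaults::WARPSIZE; // lane id in warp
   if (wid < N) {
      val_type retval{};
      hmap->warpFind(src[wid].first, retval, w_tid); // all lanes pass the same key
      assert(retval == src[wid].second);
   }
}
```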
+
+   return true;
+}
+
+bool testWarpInsertUnorderedSet(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write_warpWide_UnorderedSet<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify all elements
+   cpuOK=recover_all_elements(*hmap,src);
+   if (!cpuOK){
+      return false;
+   }
+
+
+   //Increment the values by one and reinsert; set-mode insertion must not overwrite, so the map should still hold the original values
+   vector src2(src);
+   for (auto& i:src2){
+      i.second++;
+   }
+
+   //Upload to device and insert the modified input
+   gpu_write_warpWide_UnorderedSet<<>>(hmap,src2.data(),src2.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify the original values survived
+   cpuOK=recover_all_elements(*hmap,src);
+   if (!cpuOK){
+      return false;
+   }
+
+   //duplicate test
+   {
+      size_t N = 1<resize(power+1);
+      //Upload to device and insert input
+      gpu_write_warpWide_Duplicate<<<1,1024>>>(hmap,src.data(),1);
+      SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+      if (hmap->size()!=1){
+         return false;
+      }
+   }
+
+   return true;
+}
+
+bool testWarpInsert_V(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write_warpWide_V<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify all elements
+   cpuOK=recover_all_elements(*hmap,src);
+   if (!cpuOK){
+      return false;
+   }
+   return true;
+}
+
+bool testWarpErase(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write_warpWide_V<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Erase the same input warp-wide
+   gpu_erase_warpWide<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   if (hmap->size()!=0){
+      return false;
+   }
+
+   return true;
+}
+
+
+bool test_hashmap_1(int power){
+   size_t N = 1<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   hmap.download();
+
+   //Verify all elements
+   cpuOK=recover_all_elements(hmap,src);
+   gpu_recover_all_elements<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if (!cpuOK){
+      return false;
+   }
+
+   //Delete some selection of the source data
+   d_hmap=hmap.upload();
+   gpu_delete_even<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   hmap.download();
+
+   //Quick check to verify there are no even elements
+   for (const auto& kval : hmap){
+      if (kval.second%2==0){
+         std::cout<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if (!cpuOK){
+      return false;
+   }
+
+   //Reinsert so that we can also test duplicate insertion
+   d_hmap=hmap.upload();
+   gpu_write<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   //Download
+   hmap.download();
+
+
+   //Verify all elements
+   cpuOK=recover_all_elements(hmap,src);
+   gpu_recover_all_elements<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if (!cpuOK ){
+      return false;
+   }
+
+   //If we made it to here we should be ok
+   return true;
+}
+
+
+bool test_hashmap_2(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify all elements
+   cpuOK=recover_all_elements(hmap,src);
+   gpu_recover_all_elements<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if
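test_hashmap_1 above drives the map through repeated upload, kernel, download, verify cycles; condensed into a single helper, one cycle looks roughly like this (the helper name and the launch geometry are assumptions, the named functions are the ones defined above):

```c++
// One round trip of test_hashmap_1, condensed. The <<<blocks, BLOCKSIZE>>>
// geometry is an assumption; the helpers are defined earlier in this file.
bool write_and_verify(hashmap& hmap, vector& src) {
   hashmap* d_hmap = hmap.upload();                 // device-side handle
   const size_t blocks = (src.size() + BLOCKSIZE - 1) / BLOCKSIZE;
   gpu_write<<<blocks, BLOCKSIZE>>>(d_hmap, src.data(), src.size());
   SPLIT_CHECK_ERR(split_gpuDeviceSynchronize());
   hmap.download();                                 // sync host metadata
   return recover_all_elements(hmap, src);          // host-side verification
}
```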
(!cpuOK ){ + return false; + } + + //Delete some selection of the source data + gpu_delete_even<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + + //Upload to device and insert input + gpu_write<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Upload to device and insert input + gpu_write<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + + //Delete some selection of the source data + gpu_delete_even<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Quick check to verify there are no even elements + for (const auto& kval : *hmap){ + if (kval.second%2==0){ + std::cout<>>(hmap,src.data(),src.size()); + //SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + if (!cpuOK){ + return false; + } + + //Clean Tomstones and reinsert so that we can also test duplicate insertion + hmap->clean_tombstones(); + gpu_write<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Verify all elements + cpuOK=recover_all_elements(hmap,src); + gpu_recover_all_elements<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + if (!cpuOK ){ + return false; + } + + vector src2(N,*vector_map_alloc); + create_input(src2); + gpu_remove_insert<<<1,1>>>(hmap,src.data(),src2.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + gpu_recover_all_elements<<>>(hmap,src2.data(),src2.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + delete hmap; + hmap=nullptr; + return true; +} + +bool test_hashmap_3(int power){ + size_t N = 1< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.clear(); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Clear took "< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.clear(targets::device); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Clear took "< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.resize(sz+2); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Resize took "< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.resize(sz+2,targets::device); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Resize took "< +struct Rule{ + Rule(){} + __host__ __device__ + inline bool operator()( hash_pair& element)const{ + return element.first<1000; + } +}; + + +TEST(HashmapUnitTets ,Test_ErrorCodes_ExtractKeysByPattern){ + const int sz=5; + vector src(1<()); + for (auto i:out){ + expect_true(i<1000); + } +} + +TEST(HashmapUnitTets ,Test_ErrorCodes_ExtractKeysByPatternNoAllocations){ + const int sz=5; + vector src(1<()); + + void* buffer=nullptr; + size_t mem=2*sizeof(key_type)*(1<(),buffer,mem); + + expect_true(out1==out2); + SPLIT_CHECK_ERR (split_gpuFree(buffer)); +} + + +TEST(HashmapUnitTets ,Test_Copy_Metadata){ + const int sz=18; + vector src(1< generateUniqueRandomKeys(size_t size, size_t range=std::numeric_limits::max()) { + std::vector elements; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dist(1, range); + + for (size_t i = 0; i < size; ++i) { + key_type randomNum = i;//dist(gen); + if (std::find(elements.begin(), elements.end(), 
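The two extraction tests above differ only in who owns the scratch memory: the first call lets extractKeysByPattern allocate internally, the second hands it a preallocated device buffer and checks the results agree. The intended call pattern, with the elided setup filled in as assumptions:

```c++
// Assumed shape of the buffered extraction test; sz, hmap, Rule and
// vector_key_alloc come from this test file, the allocation line is a guess.
ivector out1(1 << sz, *vector_key_alloc);
ivector out2(1 << sz, *vector_key_alloc);
hmap.extractKeysByPattern(out1, Rule<key_type, val_type>());   // internal scratch

void*  buffer = nullptr;
size_t mem    = 2 * sizeof(key_type) * (1 << sz);              // scratch estimate
SPLIT_CHECK_ERR(split_gpuMalloc((void**)&buffer, mem));
hmap.extractKeysByPattern(out2, Rule<key_type, val_type>(), buffer, mem);
expect_true(out1 == out2);
SPLIT_CHECK_ERR(split_gpuFree(buffer));
```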
randomNum) == elements.end()) { + elements.push_back(randomNum); + } else { + --i; + } + } + return elements; +} + +void insertDuplicates(std::vector& vec, key_type element, size_t count) { + if (count>0){ + vec.insert(vec.end(), count, element); + } + srand(time(NULL)); + std::random_shuffle(vec.begin(),vec.end()); +} + +TEST(HashmapUnitTets ,Test_Duplicate_Insertion){ + const int sz=10; + for (size_t duplicates=2; duplicates<=(1< keys=generateUniqueRandomKeys(1<(alloc); + auto v2=umpire::TypedAllocator>(alloc); + vector_key_alloc=&v1; + vector_map_alloc=&v2; + srand(time(NULL)); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/unit_tests/umpire_tests/vec_device.cu b/unit_tests/umpire_tests/vec_device.cu new file mode 100644 index 0000000..bdaeda4 --- /dev/null +++ b/unit_tests/umpire_tests/vec_device.cu @@ -0,0 +1,294 @@ +#include +#include +#include +#include +#include "../../include/splitvector/splitvec.h" +#include "../../include/splitvector/split_tools.h" +#include "umpire/Allocator.hpp" +#include "umpire/ResourceManager.hpp" +#include "umpire/TypedAllocator.hpp" + + +#define expect_true EXPECT_TRUE +#define expect_false EXPECT_FALSE +#define expect_eq EXPECT_EQ +#define N 1<<12 + +static umpire::TypedAllocator* vector_alloc; +typedef split::SplitVector> vec ; +// typedef split::SplitVector> vec ; + + +__global__ +void add_vectors(vec* a , vec* b,vec* c){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index< a->size()){ + c->at(index)=a->at(index)+b->at(index); + } + +} + + +__global__ +void resize_vector(vec* a , int size){ + a->device_resize(size); +} + + +__global__ +void push_back_kernel(vec* a){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + a->device_push_back(index); +} + +__global__ +void merge_kernel(vec* a,vec *b ){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index==0){ + a->device_insert(a->end(),b->begin(),b->end()); + } +} + +__global__ +void merge_kernel_2(vec* a){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index==0){ + a->device_insert(a->begin()++,3,42); + } +} + +__global__ +void erase_kernel(vec* a){ + auto it=a->begin(); + a->erase(it); + +} + + + +void print_vec_elements(const vec& v){ + std::cout<<"****Vector Contents********"<>>(d_a,d_b,d_c); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + for (const auto& e:c){ + expect_true(e==3); + } + + +} + +TEST(Constructors,Default){ + vec a(*vector_alloc); + expect_true(a.size()==0 && a.capacity()==0); + expect_true(a.data()==nullptr); +} + +TEST(Constructors,Size_based){ + vec a(N,*vector_alloc); + expect_true(a.size()==N && a.capacity()==N); + expect_true(a.data()!=nullptr); +} + + +TEST(Constructors,Specific_Value){ + vec a(N,5,*vector_alloc); + expect_true(a.size()==N && a.capacity()==N); + for (size_t i=0; i>>(d_a); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + vec* d_b=a.upload(); + erase_kernel<<<1,1>>>(d_b); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); +} + + + +TEST(Vector_Functionality , Resizing_Device){ + + { + vec a(32,42,*vector_alloc); + expect_true(a.size()==a.capacity()); + a.resize(16); + expect_true(a.size()==16); + expect_true(a.capacity()==32); + } + + { + vec a(32,42,*vector_alloc); + expect_true(a.size()==a.capacity()); + vec* d_a=a.upload(); + resize_vector<<<1,1>>>(d_a,16); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + expect_true(a.size()==16); + expect_true(a.capacity()==32); + } + + + { + vec a(32,42,*vector_alloc); + expect_true(a.size()==a.capacity()); + a.reserve(100); + 
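One portability note on insertDuplicates in the hashinator test above: std::random_shuffle was deprecated in C++14 and removed in C++17, so a build against a newer standard needs the std::shuffle form. A drop-in sketch:

```c++
// insertDuplicates with C++17-safe shuffling; behaviour otherwise matches
// the helper above (a seeded engine replaces the srand/random_shuffle pair).
#include <algorithm>
#include <random>
#include <vector>

void insertDuplicates(std::vector<key_type>& vec, key_type element, size_t count) {
   if (count > 0) {
      vec.insert(vec.end(), count, element);       // append `count` copies
   }
   static std::mt19937 gen{std::random_device{}()};
   std::shuffle(vec.begin(), vec.end(), gen);      // scatter the duplicates
}
```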
expect_true(a.capacity()>100);
+      vec* d_a=a.upload();
+      resize_vector<<<1,1>>>(d_a,64);
+      SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+      expect_true(a.size()==64);
+      expect_true(a.capacity()>100);
+      for (size_t i = 0 ; i< a.size(); ++i){
+         a.at(i)=3;
+         expect_true(a.at(i)==3);
+      }
+   }
+}
+
+TEST(Vector_Functionality , Test_CopyMetaData){
+
+   vec a(32,42,*vector_alloc);
+   expect_true(a.size()==a.capacity());
+   a.resize(16);
+   expect_true(a.size()==16);
+   expect_true(a.capacity()==32);
+   split::SplitInfo* info;
+   SPLIT_CHECK_ERR( split_gpuMallocHost((void **) &info, sizeof(split::SplitInfo)) );
+   a.copyMetadata(info);
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   expect_true(a.capacity()==info->capacity);
+   expect_true(a.size()==info->size);
+}
+
+
+int main(int argc, char* argv[]){
+   auto& rm = umpire::ResourceManager::getInstance();
+   umpire::Allocator alloc = rm.getAllocator("MANAGED");
+   auto va=umpire::TypedAllocator(alloc);
+   vector_alloc=&va;
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/umpire_tests/vec_host.cu b/unit_tests/umpire_tests/vec_host.cu
new file mode 100644
index 0000000..6c8ce89
--- /dev/null
+++ b/unit_tests/umpire_tests/vec_host.cu
@@ -0,0 +1,176 @@
+#include
+#include
+#include
+#include
+#include
+#ifndef SPLIT_CPU_ONLY_MODE
+#define SPLIT_CPU_ONLY_MODE
+#endif
+#include "../../include/splitvector/splitvec.h"
+#include "umpire/Allocator.hpp"
+#include "umpire/ResourceManager.hpp"
+#include "umpire/TypedAllocator.hpp"
+
+
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+#define N 1<<12
+
+
+
+static umpire::TypedAllocator* vector_alloc;
+typedef split::SplitVector> vec ;
+typedef std::vector> stdvec ;
+typedef split::SplitVector>::iterator split_iterator;
+
+
+
+template
+void print_vec_elements(VECTOR& v){
+   std::cout<<"****Vector Contents********"<(alloc);
+   vector_alloc=&va;
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
+
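All three Umpire test mains above bootstrap the same way: fetch a resource from the ResourceManager, wrap it in a TypedAllocator, and park a pointer to it in a file-static for the typedefs to use. A minimal standalone version of that pattern ("MANAGED" matches the tests; the int payload and include path are illustrative):

```c++
// Minimal standalone sketch of the Umpire bootstrapping shared by the test
// mains above. Everything except the "MANAGED" resource name is illustrative.
#include <iostream>
#include "../../include/splitvector/splitvec.h"
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"
#include "umpire/TypedAllocator.hpp"

int main() {
   auto& rm = umpire::ResourceManager::getInstance();
   umpire::Allocator alloc = rm.getAllocator("MANAGED");
   umpire::TypedAllocator<int> va(alloc);

   // Size-and-value constructor, as used by the vec_device tests above.
   split::SplitVector<int, umpire::TypedAllocator<int>> v(16, 42, va);
   v.push_back(7);
   std::cout << v.size() << " elements, capacity " << v.capacity() << "\n";
   return 0;
}
```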