diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..9409415
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,20 @@
+# YAML 1.2
+# Metadata for citation of this software according to the CFF format (https://citation-file-format.github.io/)
+cff-version: 1.0.3
+message: If you use this software, please cite it using these metadata.
+title: 'Hashinator: A hybrid hashmap designed for heterogeneous computing.'
+doi: 10.5281/zenodo.11396297
+authors:
+- given-names: Kostis
+  family-names: Papadakis
+  affiliation: University of Helsinki
+- given-names: Markus
+  family-names: Battarbee
+  affiliation: University of Helsinki
+- given-names: René
+  family-names: Widera
+  affiliation:
+version: 1.0.1
+date-released: 2024-05-30
+repository-code: https://github.com/fmihpc/hashinator
+license: cc-by-4.0
diff --git a/README.md b/README.md
index fa69fab..60c3351 100644
--- a/README.md
+++ b/README.md
@@ -27,8 +27,11 @@ No installation required. Just include "hashinator.h". However, if you plan
 ## Run the tests
 ```
-meson wrap install gtest # skip if you have gtest installed system wide and pkg-config can find it.
-meson setup build --buildtype=release
+cd hashinator
+mkdir subprojects
+meson wrap install gtest
+meson setup build --buildtype=release
+meson compile -C build --jobs=8
 meson test -C build
 ```
@@ -150,6 +153,45 @@ int main()
    hmap.insert(src.data(),src.size());
 }
 ```
+### Hashinator and SplitVector can also be used with external allocators
+
+```c++
+//main.cu
+#include "splitvec.h"
+#include "hashinator.h"
+
+using namespace Hashinator;
+using vector = split::SplitVector<int, std::allocator<int>>;
+
+int main()
+{
+   vector* vec = new vector{1,2,3,4,5};
+   vec->reserve(128);
+   std::cout<<vec->size()<<std::endl;
+   delete vec;
+
+   Hashmap<uint32_t, uint32_t, std::allocator<hash_pair<uint32_t, uint32_t>>> hmap;
+
+   //Write
+   for (uint32_t i=0 ; i<64; ++i){
+      hmap[i]=rand()%10000;
+   }
+
+   //Read
+   for (const auto& i:hmap){
+      std::cout<<"["<<i.first<<","<<i.second<<"] ";
+   }
+   std::cout<<std::endl;
+}
+```
diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
 #ifndef HASHINATOR_CPU_ONLY_MODE
 template <typename T>
-using DefaultMetaAllocator = split::split_unified_allocator<T>;
+using DefaultAllocator = split::split_unified_allocator<T>;
 #define DefaultHasher \
    Hashers::Hasher
 #else
 template <typename T>
-using DefaultMetaAllocator = split::split_host_allocator<T>;
+using DefaultAllocator = split::split_host_allocator<T>;
 #define DefaultHasher void
 #endif
 
 using MapInfo = Hashinator::Info;
 
-template <typename KEY_TYPE, typename VAL_TYPE, KEY_TYPE EMPTYBUCKET = std::numeric_limits<KEY_TYPE>::max(),
-          KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci<KEY_TYPE>,
-          class DeviceHasher = DefaultHasher, class Meta_Allocator = DefaultMetaAllocator<MapInfo>>
+template <typename KEY_TYPE, typename VAL_TYPE, class Allocator = DefaultAllocator<hash_pair<KEY_TYPE, VAL_TYPE>>,
+          KEY_TYPE EMPTYBUCKET = std::numeric_limits<KEY_TYPE>::max(), KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1,
+          class HashFunction = HashFunctions::Fibonacci<KEY_TYPE>, class DeviceHasher = DefaultHasher>
 class Hashmap {
 private:
+
+   // How many sizeof(hash_pair) slots are needed to hold one MapInfo, so that
+   // the single element Allocator can also provide the metadata storage.
+   constexpr size_t get_number_of_Ts_for_Map_Info() const noexcept {
+      constexpr size_t size_of_T = sizeof(hash_pair<KEY_TYPE, VAL_TYPE>);
+      constexpr size_t size_of_info = sizeof(MapInfo);
+      if constexpr (size_of_T > size_of_info) {
+         return 1;
+      }
+      // Round up with integer arithmetic; std::ceil over an integer division
+      // would truncate before the ceiling is taken.
+      return (size_of_info + size_of_T - 1) / size_of_T;
+   }
+
    // CUDA device handle
    Hashmap* device_map;
-   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>* device_buckets;
+   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>* device_buckets;
    //~CUDA device handle
 
    // Host members
-   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>> buckets;
-   Meta_Allocator _metaAllocator; // Allocator used to allocate and deallocate memory for metadata
+   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator> buckets;
+   Allocator _allocator; // Allocator used to allocate and deallocate memory for elements and metadata
    MapInfo* _mapInfo;
    //~Host members
@@ -85,8 +95,7 @@ class Hashmap {
    void preallocate_device_handles() {
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMalloc((void**)&device_map, sizeof(Hashmap)));
-      device_buckets = reinterpret_cast<split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>*>(
-          reinterpret_cast<char*>(device_map) + 
offsetof(Hashmap, buckets));
+      device_buckets = &device_map->buckets;
 #endif
    }
@@ -106,39 +115,62 @@ class Hashmap {
    inline void set_status(status code) noexcept { _mapInfo->err = code; }
 
 public:
-   Hashmap() {
+   // By default we allocate enough space for 1<<5 elements.
+   Hashmap() : _allocator(Allocator{}) {
+      preallocate_device_handles();
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
+      *_mapInfo = MapInfo(5);
+      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), _allocator);
+#ifndef HASHINATOR_CPU_ONLY_MODE
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
+#endif
+   };
+
+   // buckets must be initialized from the ctor argument: _allocator is
+   // declared after buckets and is not yet constructed at that point.
+   Hashmap(const Allocator& allocator)
+       : buckets(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+             1 << 5, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), allocator)),
+         _allocator(allocator) {
       preallocate_device_handles();
-      _mapInfo = _metaAllocator.allocate(1);
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
       *_mapInfo = MapInfo(5);
-      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
-          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
 #endif
    };
 
-   Hashmap(int sizepower) {
+   Hashmap(int sizepower) : _allocator(Allocator{}) {
+      preallocate_device_handles();
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
+      *_mapInfo = MapInfo(sizepower);
+      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), _allocator);
+#ifndef HASHINATOR_CPU_ONLY_MODE
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
+#endif
+   };
+
+   Hashmap(int sizepower, const Allocator& allocator)
+       : buckets(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, Allocator>(
+             1 << sizepower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()), allocator)),
+         _allocator(allocator) {
      preallocate_device_handles();
-      _mapInfo = _metaAllocator.allocate(1);
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
       *_mapInfo = MapInfo(sizepower);
-      buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
-          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
 #endif
    };
 
-   Hashmap(const Hashmap<KEY_TYPE, VAL_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher, Meta_Allocator>& other) {
+   Hashmap(const Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>& other) {
+      _allocator = other._allocator;
       preallocate_device_handles();
-      _mapInfo = _metaAllocator.allocate(1);
+      _mapInfo = reinterpret_cast<MapInfo*>(_allocator.allocate(get_number_of_Ts_for_Map_Info()));
       *_mapInfo = *(other._mapInfo);
       buckets = other.buckets;
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
 #endif
    };
-
-   Hashmap(Hashmap&& other) {
+
+   Hashmap(Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>&& other) {
+      _allocator = other._allocator;
       preallocate_device_handles();
       _mapInfo = other._mapInfo;
       other._mapInfo = nullptr;
@@ -148,7 +180,8 @@ class Hashmap {
 #endif
    };
 
-   Hashmap& operator=(const Hashmap<KEY_TYPE, VAL_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher, Meta_Allocator>& other) {
+   // The allocator is not replaced by the assignment operation.
+   Hashmap& operator=(const Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>& other) {
       if (this == &other) {
          return *this;
       }
@@ -162,7 +195,7 @@ class Hashmap {
 
 #ifndef HASHINATOR_CPU_ONLY_MODE
    /** Copy assign but using a provided stream */
-   void overwrite(const Hashmap<KEY_TYPE, VAL_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher, Meta_Allocator>& other, split_gpuStream_t stream = 0) {
+   void overwrite(const Hashmap<KEY_TYPE, VAL_TYPE, Allocator, EMPTYBUCKET, TOMBSTONE, HashFunction, DeviceHasher>& other, split_gpuStream_t stream = 0) {
       if (this == &other) {
          return;
       }
@@ 
-174,11 +207,11 @@ class Hashmap { } #endif - Hashmap& operator=(Hashmap&& other) { + Hashmap& operator=(Hashmap&& other) { if (this == &other) { return *this; } - _metaAllocator.deallocate(_mapInfo, 1); + _allocator.deallocate(reinterpret_cast*>(_mapInfo), get_number_of_Ts_for_Map_Info()); _mapInfo = other._mapInfo; other._mapInfo = nullptr; buckets = std::move(other.buckets); @@ -190,7 +223,7 @@ class Hashmap { ~Hashmap() { deallocate_device_handles(); - _metaAllocator.deallocate(_mapInfo, 1); + _allocator.deallocate(reinterpret_cast*>(_mapInfo), get_number_of_Ts_for_Map_Info()); }; #ifdef HASHINATOR_CPU_ONLY_MODE @@ -237,8 +270,8 @@ class Hashmap { if (newSizePower > 32) { throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets."); } - split::SplitVector> newBuckets( - 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE())); + split::SplitVector,Allocator> newBuckets( + 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE()),_allocator); _mapInfo->sizePower = newSizePower; int bitMask = (1 << _mapInfo->sizePower) - 1; // For efficient modulo of the array size @@ -322,8 +355,8 @@ class Hashmap { // DeviceHasher::reset_all(buckets.data(),_mapInfo, buckets.size(), s); } else { // Need new buckets - buckets = std::move(split::SplitVector>( - 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE()))); + buckets = std::move(split::SplitVector,Allocator>( + 1 << newSizePower, hash_pair(EMPTYBUCKET, VAL_TYPE()),_allocator)); SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, s)); optimizeGPU(s); } @@ -483,7 +516,7 @@ class Hashmap { #ifdef HASHINATOR_CPU_ONLY_MODE void clear() { - buckets = split::SplitVector>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()}); + buckets = split::SplitVector,Allocator>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()},_allocator); *_mapInfo = MapInfo(_mapInfo->sizePower); return; } @@ -493,7 +526,7 @@ class Hashmap { switch (t) { case targets::host: buckets = - split::SplitVector>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()}); + split::SplitVector,Allocator>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()},_allocator); *_mapInfo = MapInfo(_mapInfo->sizePower); break; @@ -671,11 +704,11 @@ class Hashmap { // Iterator type. Iterates through all non-empty buckets. class iterator { - Hashmap* hashtable; + Hashmap* hashtable; size_t index; public: - iterator(Hashmap& hashtable, size_t index) : hashtable(&hashtable), index(index) {} + iterator(Hashmap& hashtable, size_t index) : hashtable(&hashtable), index(index) {} iterator& operator++() { index++; @@ -706,11 +739,11 @@ class Hashmap { // Const iterator. 
class const_iterator { - const Hashmap* hashtable; + const Hashmap* hashtable; size_t index; public: - explicit const_iterator(const Hashmap& hashtable, size_t index) + explicit const_iterator(const Hashmap& hashtable, size_t index) : hashtable(&hashtable), index(index) {} const_iterator& operator++() { index++; @@ -1152,7 +1185,7 @@ class Hashmap { * hmap.extractPattern(elements,Rule()); * */ template - size_t extractPattern(split::SplitVector>& elements, Rule rule, + size_t extractPattern(split::SplitVector,Allocator>& elements, Rule rule, split_gpuStream_t s = 0) { elements.resize(_mapInfo->fill + 1, true); if constexpr (prefetches) { @@ -1179,18 +1212,18 @@ class Hashmap { const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); split::tools::splitStackArena mPool(memory_for_pool, s); size_t retval = - split::tools::copy_if_raw, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( + split::tools::copy_if_raw,Allocator, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( buckets, elements, rule, nBlocks, mPool, s); return retval; } template - void extractPatternLoop(split::SplitVector>& elements, Rule rule, + void extractPatternLoop(split::SplitVector,Allocator>& elements, Rule rule, split_gpuStream_t s = 0) { // Extract elements matching the Pattern Rule(element)==true; split::tools::copy_if_loop, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( *device_buckets, elements, rule, s); } - void extractLoop(split::SplitVector>& elements, split_gpuStream_t s = 0) { + void extractLoop(split::SplitVector,Allocator>& elements, split_gpuStream_t s = 0) { // Extract all valid elements auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE; @@ -1214,6 +1247,24 @@ class Hashmap { } return elements.size(); } + + template + size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { + elements.resize(_mapInfo->fill + 1, true); + if constexpr (prefetches) { + elements.optimizeGPU(s); + } + // Extract element **keys** matching the Pattern Rule(element)==true; + split::tools::copy_keys_if, KEY_TYPE,Allocator,KeyAlloc, Rule,defaults::MAX_BLOCKSIZE, + defaults::WARPSIZE>(buckets, elements, rule, s); + // FIXME: there is an issue where paging to host occurs and following calls to hashmap operations take a hit. 
+ // temp fix: call optimizeGPU() here + if constexpr (prefetches) { + optimizeGPU(s); + } + return elements.size(); + } + template size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void* stack, size_t max_size, split_gpuStream_t s = 0) { @@ -1226,6 +1277,20 @@ class Hashmap { defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s); return elements.size(); } + + template + size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0) { + elements.resize(_mapInfo->fill + 1, true); + if constexpr (prefetches) { + elements.optimizeGPU(s); + } + // Extract element **keys** matching the Pattern Rule(element)==true; + split::tools::copy_keys_if, KEY_TYPE,Allocator,KeyAlloc, Rule, defaults::MAX_BLOCKSIZE, + defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s); + return elements.size(); + } + template void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { // Extract element **keys** matching the Pattern Rule(element)==true; @@ -1470,11 +1535,11 @@ class Hashmap { class device_iterator { private: size_t index; - Hashmap* hashtable; + Hashmap* hashtable; public: HASHINATOR_DEVICEONLY - device_iterator(Hashmap& hashtable, size_t index) : index(index), hashtable(&hashtable) {} + device_iterator(Hashmap& hashtable, size_t index) : index(index), hashtable(&hashtable) {} HASHINATOR_DEVICEONLY size_t getIndex() { return index; } @@ -1747,27 +1812,22 @@ class Hashmap { } HASHINATOR_DEVICEONLY - const VAL_TYPE& read_element(const KEY_TYPE& key) const { - int bitMask = (1 << _mapInfo->sizePower) - 1; // For efficient modulo of the array size - auto hashIndex = hash(key); - - // Try to find the matching bucket. - const size_t bsize = buckets.size(); - for (size_t i = 0; i < bsize; i++) { - uint32_t vecindex = (hashIndex + i) & bitMask; - const hash_pair& candidate = buckets[vecindex]; - if (candidate.first == key) { - // Found a match, return that - return candidate.second; - } - if (candidate.first == EMPTYBUCKET) { - // Found an empty bucket, so error. 
-            assert(false && "Key does not exist");
-         }
-      }
-      assert(false && "Key does not exist");
-   }
-
+   std::optional<VAL_TYPE> read_element(const KEY_TYPE& key) const {
+      int bitMask = (1 << _mapInfo->sizePower) - 1;
+      auto hashIndex = hash(key);
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
+         uint32_t vecindex = (hashIndex + i) & bitMask;
+         const hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[vecindex];
+         if (candidate.first == key) {
+            return candidate.second;
+         }
+         if (candidate.first == EMPTYBUCKET) {
+            return std::nullopt;
+         }
+      }
+      return std::nullopt;
+   }
 
 #else
 
 // Uses Hasher's insert_kernel to insert all elements
diff --git a/include/splitvector/split_allocators.h b/include/splitvector/split_allocators.h
index 2527400..e905b1b 100644
--- a/include/splitvector/split_allocators.h
+++ b/include/splitvector/split_allocators.h
@@ -85,7 +85,7 @@ class split_unified_allocator {
    pointer address(reference x) const { return &x; }
    const_pointer address(const_reference x) const { return &x; }
 
-   pointer allocate(size_type n, const void* /*hint*/ = 0) {
+   static pointer allocate(size_type n, const void* /*hint*/ = 0) {
       T* ret;
       assert(n && "allocate 0");
       SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n * sizeof(value_type)));
@@ -95,21 +95,7 @@ class split_unified_allocator {
       return ret;
    }
 
-   static void* allocate_raw(size_type n, const void* /*hint*/ = 0) {
-      void* ret;
-      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n));
-      if (ret == nullptr) {
-         throw std::bad_alloc();
-      }
-      return ret;
-   }
-
-   void deallocate(pointer p, size_type n) {
-      if (n != 0 && p != 0) {
-         SPLIT_CHECK_ERR(split_gpuFree(p));
-      }
-   }
-   static void deallocate(void* p, size_type n) {
+   static void deallocate(pointer p, size_type n) {
       if (n != 0 && p != 0) {
          SPLIT_CHECK_ERR(split_gpuFree(p));
       }
@@ -129,74 +115,4 @@ class split_unified_allocator {
 };
 #endif
 
-
-/**
- * @brief Custom allocator for host memory.
- *
- * This class provides an allocator for host memory, which can be accessed
- * by the CPU. It allocates and deallocates memory using malloc and free functions,
- * while also providing constructors and destructors for objects.
- *
- * @tparam T Type of the allocated objects.
- */
-template <typename T>
-class split_host_allocator {
-public:
-   typedef T value_type;
-   typedef value_type* pointer;
-   typedef const value_type* const_pointer;
-   typedef value_type& reference;
-   typedef const value_type& const_reference;
-   typedef ptrdiff_t difference_type;
-   typedef size_t size_type;
-   template <class U>
-   struct rebind {
-      typedef split_host_allocator<U> other;
-   };
-
-   /**
-    * @brief Default constructor.
-    */
-   split_host_allocator() throw() {}
-
-   /**
-    * @brief Copy constructor with different type.
-    */
-   template <class U>
-   split_host_allocator(split_host_allocator<U> const&) throw() {}
-   pointer address(reference x) const { return &x; }
-   const_pointer address(const_reference x) const { return &x; }
-
-   pointer allocate(size_type n, const void* /*hint*/ = 0) {
-      pointer const ret = reinterpret_cast<pointer>(malloc(n * sizeof(value_type)));
-      if (ret == nullptr) {
-         throw std::bad_alloc();
-      }
-      return ret;
-   }
-
-   static void* allocate_raw(size_type n, const void* /*hint*/ = 0) {
-      void* ret = (void*)malloc(n);
-      if (ret == nullptr) {
-         throw std::bad_alloc();
-      }
-      return ret;
-   }
-
-   void deallocate(pointer p, size_type) { free(p); }
-
-   static void deallocate(void* p, size_type) { free(p); }
-
-   size_type max_size() const throw() {
-      size_type max = static_cast<size_type>(-1) / sizeof(value_type);
-      return (max > 0 ? 
max : 1); - } - - template - void construct(U* p, Args&&... args) { - ::new (p) U(std::forward(args)...); - } - - void destroy(pointer p) { p->~value_type(); } -}; } // namespace split diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index 045a624..f01e245 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -101,8 +101,8 @@ __global__ void scan_add(T* input, T* partial_sums, size_t blockSize, size_t len * @param rule The rule functor object. */ template -__global__ void scan_reduce(split::SplitVector>* input, - split::SplitVector>* output, Rule rule) { +__global__ void scan_reduce(split::SplitVector* input, + split::SplitVector* output, Rule rule) { size_t size = input->size(); size_t tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -212,10 +212,10 @@ __global__ void split_prescan(T* input, T* output, T* partial_sums, int n, size_ * @param rule The rule functor object. */ template -__global__ void split_compact(split::SplitVector>* input, - split::SplitVector>* counts, - split::SplitVector>* offsets, - split::SplitVector>* output, Rule rule) { +__global__ void split_compact(split::SplitVector* input, + split::SplitVector* counts, + split::SplitVector* offsets, + split::SplitVector* output, Rule rule) { extern __shared__ uint32_t buffer[]; const size_t size = input->size(); const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -259,10 +259,10 @@ __global__ void split_compact(split::SplitVector -__global__ void split_compact_keys(split::SplitVector>* input, - split::SplitVector>* counts, - split::SplitVector>* offsets, - split::SplitVector>* output, Rule rule) { +__global__ void split_compact_keys(split::SplitVector* input, + split::SplitVector* counts, + split::SplitVector* offsets, + split::SplitVector* output, Rule rule) { extern __shared__ uint32_t buffer[]; const size_t size = input->size(); const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -361,8 +361,8 @@ __global__ void split_compact_keys_raw(T* input, uint32_t* counts, uint32_t* off * @param s The split_gpuStream_t stream for GPU execution (default is 0). 
*/ template -void split_prefix_scan(split::SplitVector>& input, - split::SplitVector>& output, split_gpuStream_t s = 0) +void split_prefix_scan(split::SplitVector& input, + split::SplitVector& output, split_gpuStream_t s = 0) { using vector = split::SplitVector>; @@ -716,8 +716,8 @@ __global__ void block_compact_keys(T* input, U* output, size_t inputSize, Rule r } template -__global__ void loop_compact(split::SplitVector>& inputVec, - split::SplitVector>& outputVec, Rule rule) { +__global__ void loop_compact(split::SplitVector& inputVec, + split::SplitVector& outputVec, Rule rule) { // This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH __shared__ uint32_t warpSums[WARPLENGTH]; __shared__ uint32_t outputCount; @@ -918,7 +918,43 @@ size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, splitStac * @brief Same as copy_if but using raw memory */ template -uint32_t copy_if_raw(split::SplitVector>& input, T* output, Rule rule, +uint32_t copy_if_raw(split::SplitVector& input, T* output, Rule rule, + size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { + + size_t _size = input.size(); + if (_size <= BLOCKSIZE) { + return copy_if_block(input.data(), output, _size, rule, mPool, s); + } + uint32_t* d_counts; + uint32_t* d_offsets; + d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); + + // Phase 1 -- Calculate per warp workload + split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); + d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t), s)); + + // Step 2 -- Exclusive Prefix Scan on offsets + if (nBlocks == 1) { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } else { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } + + // Step 3 -- Compaction + uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); + split::tools::split_compact_raw + <<>>(input.data(), d_counts, d_offsets, + output, rule, _size, nBlocks, retval); + uint32_t numel; + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return numel; +} + +template +uint32_t copy_if_raw(split::SplitVector& input, T* output, Rule rule, size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { size_t _size = input.size(); @@ -994,8 +1030,8 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, size_t nBlocks */ template -void copy_if_loop(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, +void copy_if_loop(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { #ifdef HASHINATOR_DEBUG bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); @@ -1007,8 +1043,8 @@ void copy_if_loop(split::SplitVector>& inpu } template -void copy_if_keys_loop(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, +void copy_if_keys_loop(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { #ifdef HASHINATOR_DEBUG bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); @@ -1023,7 +1059,43 @@ void copy_if_keys_loop(split::SplitVector>& * @brief Same as copy_keys_if but using raw memory */ template -size_t copy_keys_if_raw(split::SplitVector>& input, U* output, Rule rule, +size_t copy_keys_if_raw(split::SplitVector& 
input, U* output, Rule rule, + size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { + + size_t _size = input.size(); + if (_size <= BLOCKSIZE) { + return copy_if_keys_block(input.data(), output, _size, rule, mPool, s); + } + uint32_t* d_counts; + uint32_t* d_offsets; + d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); + + // Phase 1 -- Calculate per warp workload + split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); + d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); + SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t), s)); + + // Step 2 -- Exclusive Prefix Scan on offsets + if (nBlocks == 1) { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } else { + split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); + } + + // Step 3 -- Compaction + uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); + split::tools::split_compact_keys_raw + <<>>(input.data(), d_counts, d_offsets, + output, rule, _size, nBlocks, retval); + uint32_t numel; + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return numel; +} + +template +size_t copy_keys_if_raw(split::SplitVector& input, U* output, Rule rule, size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { size_t _size = input.size(); @@ -1058,12 +1130,13 @@ size_t copy_keys_if_raw(split::SplitVector> return numel; } + /** * @brief Estimates memory needed for compacting the input splitvector */ template [[nodiscard]] size_t -estimateMemoryForCompaction(const split::SplitVector>& input) noexcept { +estimateMemoryForCompaction(const split::SplitVector& input) noexcept { // Figure out Blocks to use size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); size_t nBlocks = nextPow2(_s); @@ -1092,8 +1165,27 @@ template * @brief Same as copy_if but only for Hashinator keys */ template -void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, + split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + + // Allocate with Mempool + const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); + splitStackArena mPool(memory_for_pool, s); + auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + output.erase(&output[len], output.end()); +} + +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1126,8 +1218,8 @@ void copy_keys_if(split::SplitVector>& inpu * @param s The split_gpuStream_t stream for GPU execution (default is 0). 
*/ template -void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, split_gpuStream_t s = 0) { +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { // Figure out Blocks to use size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); @@ -1143,9 +1235,27 @@ void copy_if(split::SplitVector>& input, output.erase(&output[len], output.end()); } -template -void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, splitStackArena&& mPool, +template +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + + // Allocate with Mempool + const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); + splitStackArena mPool(memory_for_pool, s); + auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + output.erase(&output[len], output.end()); +} + +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, splitStackArena&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1159,8 +1269,8 @@ void copy_keys_if(split::SplitVector>& inpu } template -void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, splitStackArena&& mPool, +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, splitStackArena&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1173,9 +1283,9 @@ void copy_if(split::SplitVector>& input, output.erase(&output[len], output.end()); } -template -void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, void* stack, +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, void* stack, size_t max_size, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1190,9 +1300,27 @@ void copy_keys_if(split::SplitVector>& inpu output.erase(&output[len], output.end()); } -template -void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, void* stack, size_t max_size, +template +void copy_keys_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, void* stack, + size_t max_size, split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + assert(stack && "Invalid stack!"); + splitStackArena mPool(stack, max_size); + auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + output.erase(&output[len], output.end()); +} + + +template +void copy_if(split::SplitVector& input, + split::SplitVector& output, Rule rule, void* stack, size_t max_size, split_gpuStream_t s = 0) { // Figure out Blocks to use diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h index fbbb08f..b96c0a1 100644 --- a/include/splitvector/splitvec.h +++ b/include/splitvector/splitvec.h @@ -24,11 +24,13 @@ #pragma once #include "split_allocators.h" #include +#include #include #include #include #include #include +#include #include #ifndef SPLIT_CPU_ONLY_MODE @@ -47,7 +49,7 @@ using DefaultAllocator = split::split_unified_allocator; #define DEVICEONLY #define HOSTDEVICE template -using DefaultAllocator = split::split_host_allocator; +using DefaultAllocator = 
std::allocator; #endif namespace split { @@ -84,14 +86,22 @@ template > class SplitVector { private: + Allocator _allocator; // Allocator used to allocate and deallocate memory; T* _data = nullptr; // actual pointer to our data - size_t* _size; // number of elements in vector. - size_t* _capacity; // number of allocated elements + SplitInfo* _info; // stores size and capacity size_t _alloc_multiplier = 2; // host variable; multiplier for when reserving more space - Allocator _allocator; // Allocator used to allocate and deallocate memory; Residency _location; // Flags that describes the current residency of our data SplitVector* d_vec = nullptr; // device copy pointer + constexpr size_t get_number_of_Ts_for_Split_Info()const noexcept{ + constexpr size_t size_of_T=sizeof(T); + constexpr size_t size_of_info=sizeof(SplitInfo); + if constexpr (size_of_T>size_of_info){ + return 1; + } + return std::ceil(size_of_info/size_of_T); + } + /** * @brief Checks if a pointer is valid and throws an exception if it's null. * @param ptr Pointer to be checked. @@ -113,16 +123,19 @@ class SplitVector { /** * @brief Allocates memory for the vector on the host. - * + * There is a small hack done here to allow us to only use one allocator. + * We allocate memory for _info which is of type SplitInfo using an + * allocator of T and properly casting. * @param size Number of elements to allocate. * @throws std::bad_alloc If memory allocation fails. */ HOSTONLY void _allocate(size_t size) { - _size = _allocate_and_construct(size); - _capacity = _allocate_and_construct(size); - _check_ptr(_size); - _check_ptr(_capacity); - if (size == 0) { + auto n = get_number_of_Ts_for_Split_Info(); + _info = reinterpret_cast(_allocator.allocate(n)); + _check_ptr(_info); + _info->size=size; + _info->capacity=size; + if (size==0){ return; } _data = _allocate_and_construct(size, T()); @@ -141,8 +154,8 @@ class SplitVector { _deallocate_and_destroy(capacity(), _data); _data = nullptr; } - _deallocate_and_destroy(_capacity); - _deallocate_and_destroy(_size); + _allocator.deallocate(reinterpret_cast(_info),get_number_of_Ts_for_Split_Info()); + _info=nullptr; } /** @@ -154,25 +167,14 @@ class SplitVector { */ HOSTONLY T* _allocate_and_construct(size_t n, const T& val) { T* _ptr = _allocator.allocate(n); - for (size_t i = 0; i < n; i++) { - _allocator.construct(&_ptr[i], val); + if constexpr (!std::is_trivially_copy_constructible_v ){ + for (size_t i = 0; i < n; i++) { + _allocator.construct(&_ptr[i], val); + } } return _ptr; } - /** - * @brief Allocates memory and constructs metadata on the host. - * - * @param val Value to be used for construction. - * @return Pointer to the allocated and constructed memory. - */ - HOSTONLY size_t* _allocate_and_construct(const size_t& val) { - size_t* _ptr = (size_t*)_allocator.allocate_raw(sizeof(size_t)); - assert(_ptr); - *_ptr = val; - return _ptr; - } - /** * @brief Deallocates memory and destroys elements on the host. * @@ -180,26 +182,17 @@ class SplitVector { * @param _ptr Pointer to the memory to be deallocated and destroyed. */ HOSTONLY void _deallocate_and_destroy(size_t n, T* _ptr) { - for (size_t i = 0; i < n; i++) { - _allocator.destroy(&_ptr[i]); + if constexpr (!std::is_trivially_copy_constructible_v){ + for (size_t i = 0; i < n; i++) { + _allocator.destroy(&_ptr[i]); + } } _allocator.deallocate(_ptr, n); } - /** - * @brief Deallocates memory for metadata on the host. - * - * @param ptr Pointer to the memory to be deallocated. 
- */ - HOSTONLY void _deallocate_and_destroy(size_t* ptr) { - if (ptr) { - _allocator.deallocate(ptr, 1); - } - } - public: /* Available Constructors : - * -- SplitVector() --> Default constructor. Almost a no OP but _size and _capacity have to + * -- SplitVector() --> Default constructor. Almost a no OP but info->_size and info->_capacity have to * be allocated for device usage. * -- SplitVector(size_t) --> Instantiates a splitvector with a specific size. (capacity == size) * -- SplitVector(size_t,T) --> Instantiates a splitvector with a specific size and sets all @@ -213,16 +206,23 @@ class SplitVector { /** * @brief Default constructor. Creates an empty SplitVector. */ - HOSTONLY explicit SplitVector() : _location(Residency::host), d_vec(nullptr) { + HOSTONLY explicit SplitVector() :_allocator(Allocator()), _location(Residency::host), d_vec(nullptr) { this->_allocate(0); // seems counter-intuitive based on stl but it is not! } + + HOSTONLY explicit SplitVector(const Allocator& alloc) :_allocator(alloc), _location(Residency::host), d_vec(nullptr) { + this->_allocate(0); + } /** * @brief Constructor to create a SplitVector of a specified size. * * @param size The size of the SplitVector to be created. */ - HOSTONLY explicit SplitVector(size_t size) : _location(Residency::host), d_vec(nullptr) { this->_allocate(size); } + HOSTONLY explicit SplitVector(size_t size) : _allocator(Allocator()),_location(Residency::host), d_vec(nullptr) { this->_allocate(size); } + + HOSTONLY explicit SplitVector(size_t size,const Allocator& alloc) : _allocator(alloc),_location(Residency::host), d_vec(nullptr) { this->_allocate(size); } + /** * @brief Constructor to create a SplitVector of a specified size with initial values. @@ -230,7 +230,14 @@ class SplitVector { * @param size The size of the SplitVector to be created. * @param val The initial value to be assigned to each element. */ - HOSTONLY explicit SplitVector(size_t size, const T& val) : _location(Residency::host), d_vec(nullptr) { + HOSTONLY explicit SplitVector(size_t size, const T& val) :_allocator(Allocator()), _location(Residency::host), d_vec(nullptr) { + this->_allocate(size); + for (size_t i = 0; i < size; i++) { + _data[i] = val; + } + } + + HOSTONLY explicit SplitVector(size_t size, const T& val,const Allocator& alloc) :_allocator(alloc), _location(Residency::host), d_vec(nullptr) { this->_allocate(size); for (size_t i = 0; i < size; i++) { _data[i] = val; @@ -243,7 +250,7 @@ class SplitVector { * @param other The SplitVector to be copied. */ #ifdef SPLIT_CPU_ONLY_MODE - HOSTONLY explicit SplitVector(const SplitVector& other) { + HOSTONLY explicit SplitVector(const SplitVector& other):_allocator(other._allocator) { const size_t size_to_allocate = other.size(); this->_allocate(size_to_allocate); for (size_t i = 0; i < size_to_allocate; i++) { @@ -252,7 +259,7 @@ class SplitVector { } #else - HOSTONLY explicit SplitVector(const SplitVector& other) { + HOSTONLY explicit SplitVector(const SplitVector& other):_allocator(other._allocator) { const size_t size_to_allocate = other.size(); auto copySafe = [&]() -> void { for (size_t i = 0; i < size_to_allocate; i++) { @@ -280,11 +287,12 @@ class SplitVector { * @param other The SplitVector to be moved from. 
    */
   HOSTONLY SplitVector(SplitVector&& other) noexcept {
+      _allocator = other._allocator;
       _data = other._data;
-      *_size = other.size();
-      *_capacity = other.capacity();
-      *(other._capacity) = 0;
-      *(other._size) = 0;
+      // Steal the metadata block: our own _info is not yet allocated here, so
+      // writing through it would dereference an uninitialized pointer.
+      _info = other._info;
+      other._info = nullptr;
       other._data = nullptr;
       _location = other._location;
       d_vec = nullptr;
@@ -295,7 +303,7 @@ class SplitVector {
    *
    * @param init_list The initializer list to initialize the SplitVector with.
    */
-   HOSTONLY explicit SplitVector(std::initializer_list<T> init_list) : _location(Residency::host), d_vec(nullptr) {
+   HOSTONLY explicit SplitVector(std::initializer_list<T> init_list) : _allocator(Allocator()), _location(Residency::host), d_vec(nullptr) {
      this->_allocate(init_list.size());
      for (size_t i = 0; i < size(); i++) {
         _data[i] = init_list.begin()[i];
@@ -307,7 +315,7 @@ class SplitVector {
    *
    * @param other The std::vector to initialize the SplitVector with.
    */
-   HOSTONLY explicit SplitVector(const std::vector<T>& other) : _location(Residency::host), d_vec(nullptr) {
+   HOSTONLY explicit SplitVector(const std::vector<T>& other) : _allocator(Allocator()), _location(Residency::host), d_vec(nullptr) {
      this->_allocate(other.size());
      for (size_t i = 0; i < size(); i++) {
         _data[i] = other[i];
@@ -395,8 +403,7 @@ class SplitVector {
      }
      SPLIT_CHECK_ERR(
          split_gpuMemcpyAsync(_data, other._data, size() * sizeof(T), split_gpuMemcpyDeviceToDevice, stream));
-      SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), device, stream));
-      SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), device, stream));
+      SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), device, stream));
      return;
      }
   }
@@ -421,16 +428,17 @@ class SplitVector {
      _deallocate_and_destroy(capacity(), _data);
      _data = other._data;
-      *_size = other.size();
-      *_capacity = other.capacity();
-      *(other._capacity) = 0;
-      *(other._size) = 0;
+      _info->size = other.size();
+      _info->capacity = other.capacity();
+      other._info->capacity = 0;
+      other._info->size = 0;
      other._data = nullptr;
      _location = other._location;
      d_vec = nullptr;
      return *this;
   }
 
+#ifndef SPLIT_CPU_ONLY_MODE
   /**
    * @brief Custom new operator for allocation using the allocator.
    *
    * @param len The size of the allocation.
    * @return Pointer to the allocated memory.
    */
   HOSTONLY
-   void* operator new(size_t len) {
-      void* ptr = Allocator::allocate_raw(len);
-      return ptr;
+   void* operator new(size_t size) {
+      void* ret;
+      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, size));
+      if (ret == nullptr) {
+         throw std::bad_alloc();
+      }
+      return ret;
   }
 
   /**
    * @brief Custom delete operator for deallocation using the allocator.
    *
    * @param ptr Pointer to the memory to deallocate.
    */
   HOSTONLY
-   void operator delete(void* ptr) { Allocator::deallocate(ptr, 1); }
+   void operator delete(void* ptr) {
+      SPLIT_CHECK_ERR(split_gpuFree(ptr));
+   }
 
   /**
    * @brief Custom new operator for array allocation using the allocator.
    *
    * @param len The size of the allocation.
    * @return Pointer to the allocated memory.
    */
   HOSTONLY
-   void* operator new[](size_t len) {
-      void* ptr = Allocator::allocate_raw(len);
-      return ptr;
+   void* operator new[](size_t size) {
+      void* ret;
+      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, size));
+      if (ret == nullptr) {
+         throw std::bad_alloc();
+      }
+      return ret;
   }
 
   /**
    * @brief Custom delete operator for array deallocation using the allocator.
    *
    * @param ptr Pointer to the memory to deallocate.
*/ HOSTONLY - void operator delete[](void* ptr) { Allocator::deallocate(ptr); } - -#ifndef SPLIT_CPU_ONLY_MODE + void operator delete[](void* ptr) { SPLIT_CHECK_ERR(split_gpuFree(ptr));} + /** * @brief Uploads the SplitVector to the GPU. * @@ -507,18 +523,17 @@ class SplitVector { int device; SPLIT_CHECK_ERR(split_gpuGetDevice(&device)); - // First make sure _capacity does not page-fault ie prefetch it to host - // This is done because _capacity would page-fault otherwise as pointed by Markus - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); + // First make sure info->_capacity does not page-fault ie prefetch it to host + // This is done because info->_capacity would page-fault otherwise as pointed by Markus + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity == 0) { + if (_info->capacity == 0) { return; } // Now prefetch everything to device SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_data, capacity() * sizeof(T), device, stream)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), device, stream)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), device, stream)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), device, stream)); } /** @@ -528,10 +543,9 @@ class SplitVector { */ HOSTONLY void optimizeCPU(split_gpuStream_t stream = 0) noexcept { _location = Residency::host; - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), split_gpuCpuDeviceId, stream)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity == 0) { + if (_info->capacity == 0) { return; } SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_data, capacity() * sizeof(T), split_gpuCpuDeviceId, stream)); @@ -544,9 +558,8 @@ class SplitVector { * @param flags Flags for memory attachment. */ HOSTONLY void streamAttach(split_gpuStream_t s, uint32_t flags = split_gpuMemAttachSingle) { - SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_size, sizeof(size_t), flags)); - SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_capacity, sizeof(size_t), flags)); - SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_data, *_capacity * sizeof(T), flags)); + SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_info, sizeof(SplitInfo), flags)); + SPLIT_CHECK_ERR(split_gpuStreamAttachMemAsync(s, (void*)_data, _info->capacity * sizeof(T), flags)); return; } @@ -566,8 +579,7 @@ class SplitVector { * @param s The GPU stream to perform the copy on. 
*/ HOSTONLY void copyMetadata(SplitInfo* dst, split_gpuStream_t s = 0) { - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&dst->size, _size, sizeof(size_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&dst->capacity, _capacity, sizeof(size_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&dst->size, _info, sizeof(SplitInfo), split_gpuMemcpyDeviceToHost, s)); } /** @@ -581,12 +593,10 @@ class SplitVector { if (device == -1) { SPLIT_CHECK_ERR(split_gpuGetDevice(&device)); } - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_info, sizeof(SplitInfo), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); SPLIT_CHECK_ERR(split_gpuMemAdvise(_data, capacity() * sizeof(T), advice, device)); - SPLIT_CHECK_ERR(split_gpuMemAdvise(_size, sizeof(size_t), advice, device)); - SPLIT_CHECK_ERR(split_gpuMemAdvise(_capacity, sizeof(size_t), advice, device)); - SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), device, stream)); + SPLIT_CHECK_ERR(split_gpuMemAdvise(_info, sizeof(SplitInfo), advice, device)); } #endif @@ -602,8 +612,7 @@ class SplitVector { return; } split::swap(_data, other._data); - split::swap(_size, other._size); - split::swap(_capacity, other._capacity); + split::swap(_info, other._info); split::swap(_allocator, other._allocator); return; } @@ -614,7 +623,7 @@ class SplitVector { * * @return Number of elements in the container. */ - HOSTDEVICE const size_t& size() const noexcept { return *_size; } + HOSTDEVICE size_t size() const noexcept { return _info->size; } /** * @brief Bracket accessor for accessing elements by index without bounds check. @@ -680,8 +689,8 @@ class SplitVector { _deallocate_and_destroy(capacity(), _data); } _data = nullptr; - *_capacity = 0; - *_size = 0; + _info->capacity = 0; + _info->size = 0; return; } T* _new_data; @@ -703,7 +712,7 @@ class SplitVector { // Swap pointers & update capacity // Size remains the same ofc _data = _new_data; - *_capacity = requested_space; + _info->capacity = requested_space; return; } @@ -718,18 +727,20 @@ class SplitVector { * will be invalidated after a call. */ void reserve(size_t requested_space, bool eco = false) { - size_t current_space = *_capacity; + size_t current_space = _info->capacity; // Vector was default initialized if (_data == nullptr) { _deallocate(); _allocate(requested_space); - *_size = 0; + _info->size = 0; return; } // Nope. if (requested_space <= current_space) { - for (size_t i = size(); i < requested_space; ++i) { - _allocator.construct(&_data[i], T()); + if constexpr (!std::is_trivial::value) { + for (size_t i = size(); i < requested_space; ++i) { + _allocator.construct(&_data[i], T()); + } } return; } @@ -756,11 +767,11 @@ class SplitVector { void resize(size_t newSize, bool eco = false) { // Let's reserve some space and change our size if (newSize <= size()) { - *_size = newSize; + _info->size = newSize; return; } reserve(newSize, eco); - *_size = newSize; + _info->size = newSize; // TODO: should it set entries to zero? } @@ -773,8 +784,8 @@ class SplitVector { * @brief Reduce the capacity of the SplitVector to match its size. 
*/ void shrink_to_fit() { - size_t curr_cap = *_capacity; - size_t curr_size = *_size; + size_t curr_cap = _info->capacity; + size_t curr_size = _info->size; if (curr_cap == curr_size) { return; @@ -794,8 +805,8 @@ class SplitVector { HOSTONLY void reallocate(size_t requested_space, split_gpuStream_t stream = 0) { // Store addresses - const size_t __size = *_size; - const size_t __old_capacity = *_capacity; + const size_t __size = _info->size; + const size_t __old_capacity = _info->capacity; T* __old_data = _data; // Verify allocation sufficiency if (__size > requested_space) { @@ -809,8 +820,8 @@ class SplitVector { _deallocate_and_destroy(__old_capacity, __old_data); } _data = nullptr; - *_capacity = 0; - *_size = 0; + _info->capacity = 0; + _info->size = 0; return; } T* _new_data; @@ -823,7 +834,7 @@ class SplitVector { T* __new_data = _new_data; // Swap pointers & update capacity _data = _new_data; - *_capacity = requested_space; + _info->capacity = requested_space; // Perform copy on device if (__size > 0) { SPLIT_CHECK_ERR( @@ -852,11 +863,11 @@ class SplitVector { if (_data == nullptr) { _deallocate(); _allocate(requested_space); - *_size = 0; + _info->size = 0; return; } // Already has sufficient capacity? - const size_t current_space = *_capacity; + const size_t current_space = _info->capacity; if (requested_space <= current_space) { return; } @@ -883,13 +894,19 @@ class SplitVector { */ HOSTONLY void resize(size_t newSize, bool eco = false, split_gpuStream_t stream = 0) { + if (_data==nullptr){ + _data=_allocator.allocate(newSize); + _info->size = newSize; + _info->capacity = newSize; + return; + } // Let's reserve some space and change our size if (newSize <= size()) { - *_size = newSize; + _info->size = newSize; return; } reserve(newSize, eco, stream); - *_size = newSize; + _info->size = newSize; // TODO: should it set entries to zero? } @@ -899,16 +916,16 @@ class SplitVector { * @param newSize The new size of the SplitVector. */ DEVICEONLY - void device_resize(size_t newSize, bool construct = true) { + void device_resize(size_t newSize,bool _construct=true) { if (newSize > capacity()) { assert(0 && "Splitvector has a catastrophic failure trying to resize on device."); } - if (construct) { + if constexpr (!std::is_trivially_copy_constructible_v ){ for (size_t i = size(); i < newSize; ++i) { _allocator.construct(&_data[i], T()); } } - *_size = newSize; + _info->size = newSize; } /** @@ -922,8 +939,8 @@ class SplitVector { */ HOSTONLY void shrink_to_fit(split_gpuStream_t stream = 0) { - size_t curr_cap = *_capacity; - size_t curr_size = *_size; + size_t curr_cap = _info->capacity; + size_t curr_size = _info->size; if (curr_cap == curr_size) { return; @@ -958,7 +975,7 @@ class SplitVector { (_data + --i)->~T(); } } - *_size = end; + _info->size = end; } /** @@ -971,7 +988,7 @@ class SplitVector { _data[i].~T(); } } - *_size = 0; + _info->size = 0; return; } @@ -981,7 +998,7 @@ class SplitVector { * @return The capacity of the SplitVector. */ HOSTDEVICE - inline size_t capacity() const noexcept { return *_capacity; } + inline size_t capacity() const noexcept { return _info->capacity; } /** * @brief Get a reference to the last element of the SplitVector. @@ -989,7 +1006,7 @@ class SplitVector { * @return Reference to the last element. */ HOSTDEVICE - T& back() noexcept { return _data[*_size - 1]; } + T& back() noexcept { return _data[_info->size - 1]; } /** * @brief Get a const reference to the last element of the SplitVector. 
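Throughout these hunks the separate `_size`/`_capacity` pointers are collapsed into the single `_info` block. For reference, a minimal sketch of the layout this patch assumes (the authoritative `SplitInfo` definition lives in the splitvector headers; `slots_needed_for_info` is a hypothetical stand-in for `get_number_of_Ts_for_Split_Info()`):

```c++
// Sketch only: assumed shape of the metadata block used by SplitVector.
struct SplitInfo {
   size_t size;     // number of live elements
   size_t capacity; // number of allocated elements
};

// _info is carved out of storage obtained from the element allocator, so the
// request is expressed in units of sizeof(T) and rounded up (ceil division).
template <typename T>
constexpr size_t slots_needed_for_info() noexcept {
   return (sizeof(SplitInfo) + sizeof(T) - 1) / sizeof(T);
}
```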
@@ -997,7 +1014,7 @@ class SplitVector {
    * @return Const reference to the last element.
    */
   HOSTDEVICE
-   const T& back() const noexcept { return _data[*_size - 1]; }
+   const T& back() const noexcept { return _data[_info->size - 1]; }
 
   /**
    * @brief Get a reference to the first element of the SplitVector.
@@ -1030,16 +1047,17 @@ class SplitVector {
    */
   HOSTONLY
   void push_back(const T& val) {
-      // If we have no allocated memory because the default ctor was used then
-      // allocate one element, set it and return
-      if (_data == nullptr) {
-         *this = SplitVector(1, val);
-         return;
-      }
      resize(size() + 1);
      _data[size() - 1] = val;
      return;
   }
+
+   // Unsafe fast path: the caller must guarantee capacity() > size().
+   HOSTONLY
+   void push_back_unsafe(const T& val) {
+      _data[size()] = val;
+      _info->size++;
+      return;
+   }
 
   /**
    * @brief Push a moved element to the back of the SplitVector.
@@ -1051,10 +1069,11 @@ class SplitVector {
 
      // If we have no allocated memory because the default ctor was used then
      // allocate one element, set it and return
-      if (_data == nullptr) {
-         *this = SplitVector(1, std::move(val));
-         return;
-      }
+      // if (_data == nullptr) {
+      //    _allocate(1);
+      //    _data[size() - 1] = val;
+      //    return;
+      // }
      resize(size() + 1);
      _data[size() - 1] = std::move(val);
      return;
@@ -1068,9 +1087,9 @@ class SplitVector {
    */
   DEVICEONLY
   bool device_push_back(const T& val) {
-      size_t old = atomicAdd((unsigned int*)_size, 1);
+      size_t old = atomicAdd((unsigned int*)(&_info->size), 1);
      if (old >= capacity() - 1) {
-         atomicSub((unsigned int*)_size, 1);
+         atomicSub((unsigned int*)&_info->size, 1);
         return false;
      }
      atomicCAS(&(_data[old]), _data[old], val);
@@ -1087,9 +1106,9 @@ class SplitVector {
 
      // We need at least capacity=size+1 otherwise this
      // pushback cannot be done
-      size_t old = atomicAdd((unsigned int*)_size, 1);
+      size_t old = atomicAdd((unsigned int*)&_info->size, 1);
      if (old >= capacity() - 1) {
-         atomicSub((unsigned int*)_size, 1);
+         atomicSub((unsigned int*)&_info->size, 1);
         return false;
      }
      atomicCAS(&(_data[old]), _data[old], std::move(val));
@@ -1286,7 +1305,7 @@ class SplitVector {
         _data[i + 1] = _data[i];
      }
      _data[index] = val;
-      *_size = *_size + 1;
+      _info->size = _info->size + 1;
      return iterator(_data + index);
   }
@@ -1483,7 +1502,7 @@ class SplitVector {
            new (&_data[i]) T(_data[i + 1]);
         }
      }
-      *_size -= 1;
+      _info->size -= 1;
      iterator retval = &_data[index];
      return retval;
   }
@@ -1515,7 +1534,7 @@ class SplitVector {
            new (&_data[i]) T(_data[i + range]);
         }
      }
-      *_size -= end - start;
+      _info->size -= end - start;
      iterator it = &_data[start];
      return it;
   }
@@ -1540,8 +1559,13 @@ class SplitVector {
      resize(size() + 1);
      iterator it = &_data[index];
      std::move(it.data(), end().data(), it.data() + 1);
-      _allocator.destroy(it.data());
-      _allocator.construct(it.data(), args...);
+      if constexpr (!std::is_trivial<T>::value) {
+         _allocator.destroy(it.data());
+         _allocator.construct(it.data(), args...);
+      } else {
+         // Trivial types: just forward the arguments and assign.
+         *it = T(std::forward<Args>(args)...);
+      }
      return it;
   }
diff --git a/meson.build b/meson.build
index 5c10eb6..f79df0d 100644
--- a/meson.build
+++ b/meson.build
@@ -5,8 +5,9 @@ add_global_arguments('-DHASHMAPDEBUG', language : 'cpp')
 add_global_arguments('-DHASHMAPDEBUG', language : 'cuda')
 add_project_arguments(['--expt-relaxed-constexpr','--expt-extended-lambda' ], language: 'cuda')
 
 #Dependencies
 gtest_dep = dependency('gtest', fallback : ['gtest', 'gtest_dep'])
+umpire_dep = dependency('umpire', required : false)
 
 #Limit register usage in debug builds
 if ( get_option('buildtype') == 
'debug') @@ -31,8 +32,16 @@ tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependenc realisticTest = executable('realistic', 'unit_tests/benchmark/realistic.cu', dependencies :gtest_dep) hybridGPU = executable('hybrid_gpu', 'unit_tests/hybrid/main.cu',dependencies :gtest_dep ) +if umpire_dep.found() + hashinator_umpire_unit = executable('hashmap_umpire_test', 'unit_tests/umpire_tests/hashinator.cu',dependencies :[umpire_dep,gtest_dep]) + splitvector_host_umpire_unit = executable('splitvector_host_umpire_test', 'unit_tests/umpire_tests/vec_host.cu',dependencies :[umpire_dep,gtest_dep] ) + splitvector_device_umpire_unit = executable('splitvector_device_umpire_test', 'unit_tests/umpire_tests/vec_device.cu',dependencies :[umpire_dep,gtest_dep]) + test('HashinatorTestUmpire', hashinator_umpire_unit) + test('SplitVectorUmpireHost', splitvector_host_umpire_unit) + test('SplitVectorUmpireDevice', splitvector_device_umpire_unit) +endif -#Test-Runner +#Test-Runners test('HashinatorTest', hashinator_unit) test('SplitVectorDeviceTest', splitvector_device_unit) test('SplitVectorHostTest', splitvector_host_unit) diff --git a/unit_tests/gtest_vec_host/vec_test.cu b/unit_tests/gtest_vec_host/vec_test.cu index efd65a6..a461ee4 100644 --- a/unit_tests/gtest_vec_host/vec_test.cu +++ b/unit_tests/gtest_vec_host/vec_test.cu @@ -13,10 +13,10 @@ #define expect_eq EXPECT_EQ #define N 1<<12 -typedef split::SplitVector vec ; +typedef split::SplitVector> vec ; typedef std::vector stdvec ; -typedef split::SplitVector> vec2d ; -typedef split::SplitVector::iterator split_iterator; +typedef split::SplitVector>,std::allocator>>> vec2d ; +typedef split::SplitVector>::iterator split_iterator; @@ -228,7 +228,7 @@ TEST(Vector_Functionality , Push_Back_2){ TEST(Vector_Functionality , Insert_1_Element){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto s0=a.size(); auto c0=a.capacity(); auto it(a.begin()); auto it2=a.insert(it,-1); @@ -282,7 +282,7 @@ TEST(Vector_Functionality , Insert_1_Element){ TEST(Vector_Functionality , Insert_Many_Elements){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto s0=a.size(); auto c0=a.capacity(); auto it(a.begin()); auto it2=a.insert(it,10,-1); @@ -330,9 +330,9 @@ TEST(Vector_Functionality , Insert_Many_Elements){ TEST(Vector_Functionality , Insert_Range_Based){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto backup(a); - split::SplitVector b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; + vec b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; auto s0=a.size(); auto it(a.end()); auto it_b0(b.begin()); @@ -350,9 +350,9 @@ TEST(Vector_Functionality , Insert_Range_Based){ { - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; auto backup(a); - split::SplitVector b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; + vec b{-1,-2,-3,-4,-5,-6,-7,-8,-9,-10}; auto s0=a.size(); auto it(a.end()); auto it_b0(b.begin()); @@ -369,16 +369,16 @@ TEST(Vector_Functionality , Insert_Range_Based){ TEST(Vector_Functionality , Erase_Single){ - split::SplitVector a{1,2,3,4,5,6,7,8,9,10}; - split::SplitVector b{1,2,3,5,6,7,8,9,10}; + vec a{1,2,3,4,5,6,7,8,9,10}; + vec b{1,2,3,5,6,7,8,9,10}; split_iterator it0=&a[3]; a.erase(it0); expect_true(a==b); } TEST(Vector_Functionality , Erase_Range){ - split::SplitVector a{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}; - split::SplitVector b{0,1,7,8,9,10,11,12,13,14}; + vec a{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}; + vec b{0,1,7,8,9,10,11,12,13,14}; split_iterator it0=&a[2]; 
 split_iterator it1=&a[7];
 a.erase(it0,it1);
 expect_true(a==b);
}
diff --git a/unit_tests/hashmap_unit_test/main.cu b/unit_tests/hashmap_unit_test/main.cu
index 7d6f3d8..2da028a 100644
--- a/unit_tests/hashmap_unit_test/main.cu
+++ b/unit_tests/hashmap_unit_test/main.cu
@@ -1084,16 +1084,7 @@ TEST(HashmapUnitTets ,Test_Duplicate_Insertion){
    }
 }
 
-TEST(HashmapUnitTets ,EMPTY_TOMBSTONE_values){
-   Hashmap<key_type,val_type> hmap;
-   const auto tombstone = hmap.get_tombstone();
-   const auto emptybucket = hmap.get_emptybucket();
-   expect_true(tombstone==emptybucket);
-   //These can be also checked during compile time
-   static_assert(tombstone==emptybucket);
-}
-
 int main(int argc, char* argv[]){
    srand(time(NULL));
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
diff --git a/unit_tests/umpire_tests/hashinator.cu b/unit_tests/umpire_tests/hashinator.cu
new file mode 100644
index 0000000..ac5e396
--- /dev/null
+++ b/unit_tests/umpire_tests/hashinator.cu
@@ -0,0 +1,996 @@
+#include <gtest/gtest.h>
+#include <iostream>
+#include <chrono>
+#include <random>
+#include "../../include/hashinator/hashinator.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <limits>
+#include "include/hashinator/hash_pair.h"
+#include "umpire/Allocator.hpp"
+#include "umpire/ResourceManager.hpp"
+#include "umpire/TypedAllocator.hpp"
+
+#define BLOCKSIZE 32
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+constexpr int MINPOWER = 10;
+constexpr int MAXPOWER = 11;
+
+using namespace std::chrono;
+using namespace Hashinator;
+typedef uint32_t val_type;
+typedef uint32_t key_type;
+
+static umpire::TypedAllocator<key_type>* vector_key_alloc;
+static umpire::TypedAllocator<hash_pair<key_type,val_type>>* vector_map_alloc;
+typedef split::SplitVector<hash_pair<key_type,val_type>, umpire::TypedAllocator<hash_pair<key_type,val_type>>> vector;
+typedef split::SplitVector<key_type, umpire::TypedAllocator<key_type>> ivector;
+
+// typedef split::SplitVector<hash_pair<key_type,val_type>> vector;
+// typedef split::SplitVector<key_type> ivector;
+typedef Hashmap<key_type, val_type, umpire::TypedAllocator<hash_pair<key_type,val_type>>> hashmap;
+
+struct Predicate{
+   HASHINATOR_HOSTDEVICE
+   inline bool operator()( hash_pair<key_type,val_type>& element)const{
+      return element.second%2==0;
+   }
+};
+
+template <class Fn, class ... Args>
+auto execute_and_time(const char* name,Fn fn, Args && ... 
args) ->bool{ + std::chrono::time_point start,stop; + double total_time=0; + start = std::chrono::high_resolution_clock::now(); + bool retval=fn(args...); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + total_time+=duration; + //std::cout<& kval=src.at(i); + kval.first=i + bias; + kval.second=i; + } +} + + +void cpu_write(hashmap& hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + hmap.at(kval.first)=kval.second; + } +} + +__global__ +void gpu_write(hashmap* hmap, hash_pair*src, size_t N){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N ){ + hmap->set_element(src[index].first, src[index].second); + } +} + + +__global__ +void gpu_remove_insert(hashmap* hmap, hash_pair*rm, hash_pair*add, size_t N){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index ==0 ){ + for ( int i =0; i elem=rm[i]; + auto rmval=hmap->read_element(elem.first); + hmap->device_erase(elem.first); + } + for ( int i =0; i elem=add[i]; + hmap->set_element(elem.first,elem.second); + } + } +} + + +__global__ +void gpu_delete_even(hashmap* hmap, hash_pair*src,size_t N){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (indexdevice_find(src[index].first); + if (kpos==hmap->device_end()){assert(0 && "Catastrophic crash in deletion");} + if (kpos->second %2==0 ){ + int retval=hmap->device_erase(kpos->first); + assert(retval==1 && "Failed to erase!"); + retval=hmap->device_erase(kpos->first); + assert(retval==0 && "Failed to not erase!"); + + } + } + return; +} + + + +__global__ +void gpu_recover_odd_elements(hashmap* hmap,hash_pair* src,size_t N ){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N ){ + key_type key= src[index].first; + val_type val= src[index].second; + if (val%2!=0){ + auto it=hmap->device_find(key); + if (it==hmap->device_end()){ + assert( 0 && "Failed in GPU RECOVER ALL "); + } + if (it->first!=key || it->second!=val){ + assert( 0 && "Failed in GPU RECOVER ALL "); + } + } + } + + //Iterate over all elements with 1 thread and check for evens; + if (index==0){ + for (auto it=hmap->device_begin(); it!=hmap->device_end(); ++it){ + if (it->second%2==0 ){ + printf("Found even when there should not be any!\n"); + assert(0); + } + } + } + return; +} + +bool recover_odd_elements(const hashmap& hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + if (kval.second%2!=0){ + auto retval=hmap.find(kval.first); + if (retval==hmap.end()){return false;} + bool sane=retval->first==kval.first && retval->second== kval.second ; + if (!sane){ + return false; + } + } + } + return true; +} + +bool recover_all_elements(const hashmap& hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + //std::cout<<"Validating "<first<<" "<second<& kval=src.at(i); + if (kval.second%2!=0){ + auto retval=hmap->find(kval.first); + if (retval==hmap->end()){return false;} + bool sane=retval->first==kval.first && retval->second== kval.second ; + if (!sane){ + return false; + } + } + } + return true; +} + +bool recover_all_elements(hashmap* hmap, vector& src){ + for (size_t i=0; i& kval=src.at(i); + auto retval=hmap->find(kval.first); + if (retval==hmap->end()){return false;} + bool sane=retval->first==kval.first && retval->second== kval.second ; + if (!sane){ + return false; + } + } + return true; +} + +__global__ +void gpu_recover_all_elements(hashmap* hmap,hash_pair* src,size_t N ){ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N ){ + key_type key= src[index].first; + val_type 
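The execute_and_time helper earlier in this file lost its chrono template arguments in rendering; a minimal self-contained sketch of the same harness (microsecond resolution is an assumption) is:

```c++
// Minimal sketch of the timing harness: run fn(args...), measure wall time,
// forward the boolean verdict. Units and printing behaviour are assumptions.
#include <chrono>
#include <utility>

template <typename Fn, typename... Args>
auto execute_and_time(const char* name, Fn fn, Args&&... args) -> bool {
   const auto start  = std::chrono::high_resolution_clock::now();
   const bool retval = fn(std::forward<Args>(args)...);
   const auto stop   = std::chrono::high_resolution_clock::now();
   const auto total_time =
       std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
   (void)name;
   (void)total_time; // the test file only prints these in a commented-out path
   return retval;
}
```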
val= src[index].second; + auto it=hmap->device_find(key); + if (it==hmap->device_end()){ + printf("END FOUND DEVICE\n"); + assert( 0 && "Failed in GPU RECOVER ALL "); + } + if (it->first!=key || it->second!=val){ + assert( 0 && "Failed in GPU RECOVER ALL "); + } + } + return; +} + +__global__ +void gpu_recover_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type retval;; + val_type val= src[wid].second; + hmap->warpFind(key,retval,w_tid); + assert(retval==val); + } +} + +__global__ +void gpu_recover_non_existant_key_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + val_type retval=42;; + key_type key=42 ; + hmap->warpFind(key,retval,w_tid); + assert(retval==42); + } +} + +__global__ +void gpu_write_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type val= src[wid].second; + hmap->warpInsert(key,val,w_tid); + } +} + +__global__ +void gpu_write_warpWide_UnorderedSet(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type val= src[wid].second; + hmap->warpInsert<1>(key,val,w_tid); + } +} + +__global__ +void gpu_write_warpWide_Duplicate(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[0].first; + val_type val= src[0].second; + hmap->warpInsert(key,val,w_tid); + } +} + +__global__ +void gpu_erase_warpWide(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + hmap->warpErase(key,w_tid); + } +} + +__global__ +void gpu_write_warpWide_V(hashmap* hmap,hash_pair* src,size_t N ){ + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t wid = index / Hashinator::defaults::WARPSIZE; + const size_t w_tid = index % defaults::WARPSIZE; + if (wid < N ){ + key_type key= src[wid].first; + val_type val= src[wid].second; + auto retval=hmap->warpInsert_V(key,val,w_tid); + assert(retval); + } +} + +bool testWarpInsert(int power){ + size_t N = 1<resize(power+1); + + //Upload to device and insert input + gpu_write_warpWide<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Verify all elements + cpuOK=recover_all_elements(*hmap,src); + if (!cpuOK){ + return false; + } + + //duplicate test + { + size_t N = 1<resize(power+1); + //Upload to device and insert input + gpu_write_warpWide_Duplicate<<<1,1024>>>(hmap,src.data(),1); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + if (hmap->size()!=1){ + return false; + } + } + 
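All of the warp-wide kernels above share one addressing convention: a warp, not a thread, owns each element, so every lane computes the same warp id and passes its lane id to the warp-cooperative call. Condensed into one illustrative kernel (the kernel name is not from the diff):

```c++
// Illustrative warp-cooperative lookup, following the kernels above.
// Assumes the hashmap/key_type/val_type aliases of this test file.
#include <cassert>

__global__ void warp_find_sketch(hashmap* hmap,
                                 Hashinator::hash_pair<key_type, val_type>* src,
                                 size_t N) {
   const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t wid   = index / Hashinator::defaults::WARPSIZE; // one warp per key
   const size_t w_tid = index % Hashinator::defaults::WARPSIZE; // lane id in warp
   if (wid < N) {
      val_type retval{};
      hmap->warpFind(src[wid].first, retval, w_tid); // all lanes pass the same key
      assert(retval == src[wid].second);
   }
}
```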
+
+   return true;
+}
+
+bool testWarpInsertUnorderedSet(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write_warpWide_UnorderedSet<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify all elements
+   cpuOK=recover_all_elements(*hmap,src);
+   if (!cpuOK){
+      return false;
+   }
+
+
+   //Increment the values by one and reinsert; set-mode insertion must not overwrite, so the map should still hold the original values
+   vector src2(src);
+   for (auto& i:src2){
+      i.second++;
+   }
+
+   //Upload to device and insert the modified input
+   gpu_write_warpWide_UnorderedSet<<>>(hmap,src2.data(),src2.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify the original values survived
+   cpuOK=recover_all_elements(*hmap,src);
+   if (!cpuOK){
+      return false;
+   }
+
+   //duplicate test
+   {
+      size_t N = 1<resize(power+1);
+      //Upload to device and insert input
+      gpu_write_warpWide_Duplicate<<<1,1024>>>(hmap,src.data(),1);
+      SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+      if (hmap->size()!=1){
+         return false;
+      }
+   }
+
+   return true;
+}
+
+bool testWarpInsert_V(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write_warpWide_V<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify all elements
+   cpuOK=recover_all_elements(*hmap,src);
+   if (!cpuOK){
+      return false;
+   }
+   return true;
+}
+
+bool testWarpErase(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write_warpWide_V<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Erase the same input warp-wide
+   gpu_erase_warpWide<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   if (hmap->size()!=0){
+      return false;
+   }
+
+   return true;
+}
+
+
+bool test_hashmap_1(int power){
+   size_t N = 1<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   hmap.download();
+
+   //Verify all elements
+   cpuOK=recover_all_elements(hmap,src);
+   gpu_recover_all_elements<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if (!cpuOK){
+      return false;
+   }
+
+   //Delete some selection of the source data
+   d_hmap=hmap.upload();
+   gpu_delete_even<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   hmap.download();
+
+   //Quick check to verify there are no even elements
+   for (const auto& kval : hmap){
+      if (kval.second%2==0){
+         std::cout<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if (!cpuOK){
+      return false;
+   }
+
+   //Reinsert so that we can also test duplicate insertion
+   d_hmap=hmap.upload();
+   gpu_write<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   //Download
+   hmap.download();
+
+
+   //Verify all elements
+   cpuOK=recover_all_elements(hmap,src);
+   gpu_recover_all_elements<<>>(d_hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if (!cpuOK ){
+      return false;
+   }
+
+   //If we made it to here we should be ok
+   return true;
+}
+
+
+bool test_hashmap_2(int power){
+   size_t N = 1<resize(power+1);
+
+   //Upload to device and insert input
+   gpu_write<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+
+   //Verify all elements
+   cpuOK=recover_all_elements(hmap,src);
+   gpu_recover_all_elements<<>>(hmap,src.data(),src.size());
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   if
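test_hashmap_1 above drives the map through repeated upload, kernel, download, verify cycles; condensed into a single helper, one cycle looks roughly like this (the helper name and the launch geometry are assumptions, the named functions are the ones defined above):

```c++
// One round trip of test_hashmap_1, condensed. The <<<blocks, BLOCKSIZE>>>
// geometry is an assumption; the helpers are defined earlier in this file.
bool write_and_verify(hashmap& hmap, vector& src) {
   hashmap* d_hmap = hmap.upload();                 // device-side handle
   const size_t blocks = (src.size() + BLOCKSIZE - 1) / BLOCKSIZE;
   gpu_write<<<blocks, BLOCKSIZE>>>(d_hmap, src.data(), src.size());
   SPLIT_CHECK_ERR(split_gpuDeviceSynchronize());
   hmap.download();                                 // sync host metadata
   return recover_all_elements(hmap, src);          // host-side verification
}
```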
(!cpuOK ){ + return false; + } + + //Delete some selection of the source data + gpu_delete_even<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + + //Upload to device and insert input + gpu_write<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Upload to device and insert input + gpu_write<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + + //Delete some selection of the source data + gpu_delete_even<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Quick check to verify there are no even elements + for (const auto& kval : *hmap){ + if (kval.second%2==0){ + std::cout<>>(hmap,src.data(),src.size()); + //SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + if (!cpuOK){ + return false; + } + + //Clean Tomstones and reinsert so that we can also test duplicate insertion + hmap->clean_tombstones(); + gpu_write<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + //Verify all elements + cpuOK=recover_all_elements(hmap,src); + gpu_recover_all_elements<<>>(hmap,src.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + if (!cpuOK ){ + return false; + } + + vector src2(N,*vector_map_alloc); + create_input(src2); + gpu_remove_insert<<<1,1>>>(hmap,src.data(),src2.data(),src.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + gpu_recover_all_elements<<>>(hmap,src2.data(),src2.size()); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + delete hmap; + hmap=nullptr; + return true; +} + +bool test_hashmap_3(int power){ + size_t N = 1< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.clear(); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Clear took "< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.clear(targets::device); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Clear took "< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.resize(sz+2); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Resize took "< start,stop; + start = std::chrono::high_resolution_clock::now(); + hmap.resize(sz+2,targets::device); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + //std::cout<<"Resize took "< +struct Rule{ + Rule(){} + __host__ __device__ + inline bool operator()( hash_pair& element)const{ + return element.first<1000; + } +}; + + +TEST(HashmapUnitTets ,Test_ErrorCodes_ExtractKeysByPattern){ + const int sz=5; + vector src(1<()); + for (auto i:out){ + expect_true(i<1000); + } +} + +TEST(HashmapUnitTets ,Test_ErrorCodes_ExtractKeysByPatternNoAllocations){ + const int sz=5; + vector src(1<()); + + void* buffer=nullptr; + size_t mem=2*sizeof(key_type)*(1<(),buffer,mem); + + expect_true(out1==out2); + SPLIT_CHECK_ERR (split_gpuFree(buffer)); +} + + +TEST(HashmapUnitTets ,Test_Copy_Metadata){ + const int sz=18; + vector src(1< generateUniqueRandomKeys(size_t size, size_t range=std::numeric_limits::max()) { + std::vector elements; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dist(1, range); + + for (size_t i = 0; i < size; ++i) { + key_type randomNum = i;//dist(gen); + if (std::find(elements.begin(), elements.end(), 
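The two extraction tests above differ only in who owns the scratch memory: the first call lets extractKeysByPattern allocate internally, the second hands it a preallocated device buffer and checks the results agree. The intended call pattern, with the elided setup filled in as assumptions:

```c++
// Assumed shape of the buffered extraction test; sz, hmap, Rule and
// vector_key_alloc come from this test file, the allocation line is a guess.
ivector out1(1 << sz, *vector_key_alloc);
ivector out2(1 << sz, *vector_key_alloc);
hmap.extractKeysByPattern(out1, Rule<key_type, val_type>());   // internal scratch

void*  buffer = nullptr;
size_t mem    = 2 * sizeof(key_type) * (1 << sz);              // scratch estimate
SPLIT_CHECK_ERR(split_gpuMalloc((void**)&buffer, mem));
hmap.extractKeysByPattern(out2, Rule<key_type, val_type>(), buffer, mem);
expect_true(out1 == out2);
SPLIT_CHECK_ERR(split_gpuFree(buffer));
```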
randomNum) == elements.end()) { + elements.push_back(randomNum); + } else { + --i; + } + } + return elements; +} + +void insertDuplicates(std::vector& vec, key_type element, size_t count) { + if (count>0){ + vec.insert(vec.end(), count, element); + } + srand(time(NULL)); + std::random_shuffle(vec.begin(),vec.end()); +} + +TEST(HashmapUnitTets ,Test_Duplicate_Insertion){ + const int sz=10; + for (size_t duplicates=2; duplicates<=(1< keys=generateUniqueRandomKeys(1<(alloc); + auto v2=umpire::TypedAllocator>(alloc); + vector_key_alloc=&v1; + vector_map_alloc=&v2; + srand(time(NULL)); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/unit_tests/umpire_tests/vec_device.cu b/unit_tests/umpire_tests/vec_device.cu new file mode 100644 index 0000000..bdaeda4 --- /dev/null +++ b/unit_tests/umpire_tests/vec_device.cu @@ -0,0 +1,294 @@ +#include +#include +#include +#include +#include "../../include/splitvector/splitvec.h" +#include "../../include/splitvector/split_tools.h" +#include "umpire/Allocator.hpp" +#include "umpire/ResourceManager.hpp" +#include "umpire/TypedAllocator.hpp" + + +#define expect_true EXPECT_TRUE +#define expect_false EXPECT_FALSE +#define expect_eq EXPECT_EQ +#define N 1<<12 + +static umpire::TypedAllocator* vector_alloc; +typedef split::SplitVector> vec ; +// typedef split::SplitVector> vec ; + + +__global__ +void add_vectors(vec* a , vec* b,vec* c){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index< a->size()){ + c->at(index)=a->at(index)+b->at(index); + } + +} + + +__global__ +void resize_vector(vec* a , int size){ + a->device_resize(size); +} + + +__global__ +void push_back_kernel(vec* a){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + a->device_push_back(index); +} + +__global__ +void merge_kernel(vec* a,vec *b ){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index==0){ + a->device_insert(a->end(),b->begin(),b->end()); + } +} + +__global__ +void merge_kernel_2(vec* a){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index==0){ + a->device_insert(a->begin()++,3,42); + } +} + +__global__ +void erase_kernel(vec* a){ + auto it=a->begin(); + a->erase(it); + +} + + + +void print_vec_elements(const vec& v){ + std::cout<<"****Vector Contents********"<>>(d_a,d_b,d_c); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + for (const auto& e:c){ + expect_true(e==3); + } + + +} + +TEST(Constructors,Default){ + vec a(*vector_alloc); + expect_true(a.size()==0 && a.capacity()==0); + expect_true(a.data()==nullptr); +} + +TEST(Constructors,Size_based){ + vec a(N,*vector_alloc); + expect_true(a.size()==N && a.capacity()==N); + expect_true(a.data()!=nullptr); +} + + +TEST(Constructors,Specific_Value){ + vec a(N,5,*vector_alloc); + expect_true(a.size()==N && a.capacity()==N); + for (size_t i=0; i>>(d_a); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + vec* d_b=a.upload(); + erase_kernel<<<1,1>>>(d_b); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); +} + + + +TEST(Vector_Functionality , Resizing_Device){ + + { + vec a(32,42,*vector_alloc); + expect_true(a.size()==a.capacity()); + a.resize(16); + expect_true(a.size()==16); + expect_true(a.capacity()==32); + } + + { + vec a(32,42,*vector_alloc); + expect_true(a.size()==a.capacity()); + vec* d_a=a.upload(); + resize_vector<<<1,1>>>(d_a,16); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + expect_true(a.size()==16); + expect_true(a.capacity()==32); + } + + + { + vec a(32,42,*vector_alloc); + expect_true(a.size()==a.capacity()); + a.reserve(100); + 
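One portability note on insertDuplicates in the hashinator test above: std::random_shuffle was deprecated in C++14 and removed in C++17, so a build against a newer standard needs the std::shuffle form. A drop-in sketch:

```c++
// insertDuplicates with C++17-safe shuffling; behaviour otherwise matches
// the helper above (a seeded engine replaces the srand/random_shuffle pair).
#include <algorithm>
#include <random>
#include <vector>

void insertDuplicates(std::vector<key_type>& vec, key_type element, size_t count) {
   if (count > 0) {
      vec.insert(vec.end(), count, element);       // append `count` copies
   }
   static std::mt19937 gen{std::random_device{}()};
   std::shuffle(vec.begin(), vec.end(), gen);      // scatter the duplicates
}
```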
expect_true(a.capacity()>100);
+      vec* d_a=a.upload();
+      resize_vector<<<1,1>>>(d_a,64);
+      SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+      expect_true(a.size()==64);
+      expect_true(a.capacity()>100);
+      for (size_t i = 0 ; i< a.size(); ++i){
+         a.at(i)=3;
+         expect_true(a.at(i)==3);
+      }
+   }
+}
+
+TEST(Vector_Functionality , Test_CopyMetaData){
+
+   vec a(32,42,*vector_alloc);
+   expect_true(a.size()==a.capacity());
+   a.resize(16);
+   expect_true(a.size()==16);
+   expect_true(a.capacity()==32);
+   split::SplitInfo* info;
+   SPLIT_CHECK_ERR( split_gpuMallocHost((void **) &info, sizeof(split::SplitInfo)) );
+   a.copyMetadata(info);
+   SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );
+   expect_true(a.capacity()==info->capacity);
+   expect_true(a.size()==info->size);
+}
+
+
+int main(int argc, char* argv[]){
+   auto& rm = umpire::ResourceManager::getInstance();
+   umpire::Allocator alloc = rm.getAllocator("MANAGED");
+   auto va=umpire::TypedAllocator(alloc);
+   vector_alloc=&va;
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/umpire_tests/vec_host.cu b/unit_tests/umpire_tests/vec_host.cu
new file mode 100644
index 0000000..6c8ce89
--- /dev/null
+++ b/unit_tests/umpire_tests/vec_host.cu
@@ -0,0 +1,176 @@
+#include
+#include
+#include
+#include
+#include
+#ifndef SPLIT_CPU_ONLY_MODE
+#define SPLIT_CPU_ONLY_MODE
+#endif
+#include "../../include/splitvector/splitvec.h"
+#include "umpire/Allocator.hpp"
+#include "umpire/ResourceManager.hpp"
+#include "umpire/TypedAllocator.hpp"
+
+
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+#define N 1<<12
+
+
+
+static umpire::TypedAllocator* vector_alloc;
+typedef split::SplitVector> vec ;
+typedef std::vector> stdvec ;
+typedef split::SplitVector>::iterator split_iterator;
+
+
+
+template
+void print_vec_elements(VECTOR& v){
+   std::cout<<"****Vector Contents********"<(alloc);
+   vector_alloc=&va;
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
+
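All three Umpire test mains above bootstrap the same way: fetch a resource from the ResourceManager, wrap it in a TypedAllocator, and park a pointer to it in a file-static for the typedefs to use. A minimal standalone version of that pattern ("MANAGED" matches the tests; the int payload and include path are illustrative):

```c++
// Minimal standalone sketch of the Umpire bootstrapping shared by the test
// mains above. Everything except the "MANAGED" resource name is illustrative.
#include <iostream>
#include "../../include/splitvector/splitvec.h"
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"
#include "umpire/TypedAllocator.hpp"

int main() {
   auto& rm = umpire::ResourceManager::getInstance();
   umpire::Allocator alloc = rm.getAllocator("MANAGED");
   umpire::TypedAllocator<int> va(alloc);

   // Size-and-value constructor, as used by the vec_device tests above.
   split::SplitVector<int, umpire::TypedAllocator<int>> v(16, 42, va);
   v.push_back(7);
   std::cout << v.size() << " elements, capacity " << v.capacity() << "\n";
   return 0;
}
```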