CUDA Unified Memory: remove EMULATE option

The non-emulate path already allocates with cudaMallocManaged, so unified memory keeps working without a separate EMULATE option.
crtrott · Aug 21, 2024 · 62527f6 · 62527f6
1 parent 0e2d3a3
commit 62527f6
Show file tree

Hide file tree

Showing 6 changed files with 0 additions and 27 deletions.
diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in
@@ -38,7 +38,6 @@
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
 #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
-#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
 #cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY

diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake
@@ -48,7 +48,6 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
 # resolved but we keep the option around a bit longer to be safe.
 KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
 KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
 KOKKOS_ENABLE_OPTION(IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting")
 KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")
-KOKKOS_ENABLE_OPTION(IMPL_CUDA_EMULATE_UNIFIED_MEMORY OFF "Whether to emulate unified memory architectures for CUDA in non-Grace Hopper systems")
 KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4    ON "Whether code deprecated in major release 4 is available" )

diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -177,12 +177,6 @@ void *impl_allocate_common(const int device_id,
   cudaError_t error_code = cudaSuccess;
 #ifndef CUDART_VERSION
 #error CUDART_VERSION undefined!
-#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
-  // This is intended to simulate Grace-Hopper-like behavior
-  error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
-  if (error_code == cudaSuccess) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
-  }
 #elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
   // This is intended for Grace-Hopper (and future unified memory architectures)
   // The idea is to use host allocator and then advise to keep it in HBM on the

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -719,10 +719,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
   os << "  KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: ";
   os << "yes\n";
 #endif
-#ifdef KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY
-  os << "  KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY: ";
-  os << "yes\n";
-#endif
 
   os << "\nCuda Runtime Configuration:\n";
 

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -324,13 +324,7 @@ class CudaInternal {
   template <bool setCudaDevice = true>
   cudaError_t cuda_malloc_wrapper(void** devPtr, size_t size) const {
     if constexpr (setCudaDevice) set_cuda_device();
-#if !defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
     return cudaMalloc(devPtr, size);
-#else
-    auto ptr = cudaMallocManaged(devPtr, size, cudaMemAttachGlobal);
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
-    return ptr;
-#endif
   }
 
   template <bool setCudaDevice = true>

diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp
@@ -550,15 +550,6 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
 #endif
 
-#if defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
-#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
-#endif
-
-// TODO: enable the following when we are sure it is the right thing to do
-// #if defined(KOKKOS_ARCH_ARMV9_GRACE) && defined(KOKKOS_ARCH_HOPPER90)
-// #define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
-// #endif
-
 #define KOKKOS_INVALID_INDEX (~std::size_t(0))
 
 #define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX