From e37266378ca15e409e741a895852c4a861f6ba19 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Thu, 30 Jul 2020 10:57:56 -0700 Subject: [PATCH 01/36] Update all files to use Hydrogen 1.4 API (#1588) * Update all files to use Hydrogen 1.4 API * prevent linking with a ROCm-enabled Hydrogen Once LBANN supports ROCm, obviously this will change. * update bamboo build script * missed a couple, apparently --- CMakeLists.txt | 12 ++++++++---- bamboo/compiler_tests/build_script.sh | 2 +- include/lbann/layers/transform/concatenate.hpp | 4 ++-- include/lbann/utils/cuda.hpp | 2 +- include/lbann/utils/impl/cuda.hpp | 8 ++++---- src/callbacks/profiler.cpp | 2 +- src/callbacks/sync_layers.cpp | 2 +- src/comm.cpp | 4 ++-- src/layers/activations/elu.cu | 4 ++-- src/layers/activations/leaky_relu.cu | 4 ++-- src/layers/activations/log_softmax.cu | 8 ++++---- src/layers/activations/softmax.cu | 8 ++++---- src/layers/data_type_distconv_adapter.cpp | 16 ++++++++-------- src/layers/data_type_layer.cpp | 8 ++++---- src/layers/image/bilinear_resize.cu | 2 +- src/layers/io/input/input_layer.cpp | 4 ++-- src/layers/learning/base_convolution.cpp | 4 ++-- src/layers/learning/channelwise_scale_bias.cu | 4 ++-- src/layers/learning/embedding.cu | 4 ++-- src/layers/learning/entrywise_scale_bias.cu | 4 ++-- src/layers/loss/categorical_accuracy.cu | 4 ++-- src/layers/loss/cross_entropy.cu | 8 ++++---- src/layers/loss/entrywise.cu | 4 ++-- src/layers/loss/l1_norm.cu | 8 ++++---- src/layers/loss/l2_norm2.cu | 8 ++++---- src/layers/loss/mean_absolute_error.cu | 8 ++++---- src/layers/loss/mean_squared_error.cu | 8 ++++---- src/layers/loss/top_k_categorical_accuracy.cu | 4 ++-- src/layers/math/binary.cu | 4 ++-- src/layers/math/clamp.cu | 4 ++-- src/layers/math/matmul.cpp | 4 ++-- src/layers/misc/channelwise_mean.cu | 4 ++-- src/layers/misc/channelwise_softmax.cu | 12 ++++++------ src/layers/misc/covariance.cu | 6 +++--- src/layers/misc/dist_embedding.cu | 6 +++--- src/layers/misc/one_hot.cu | 2 +- src/layers/misc/variance.cu | 4 ++-- src/layers/regularizers/batch_normalization.cu | 8 ++++---- .../entrywise_batch_normalization.cu | 12 ++++++------ src/layers/regularizers/instance_norm.cu | 8 ++++---- src/layers/regularizers/layer_norm.cu | 10 +++++----- src/layers/transform/concatenate.cu | 4 ++-- src/layers/transform/crop.cu | 4 ++-- src/layers/transform/evaluation.cpp | 4 ++-- src/layers/transform/in_top_k.cu | 4 ++-- src/layers/transform/slice.cu | 4 ++-- src/layers/transform/sort.cu | 4 ++-- src/layers/transform/split.cu | 10 +++++----- src/layers/transform/sum.cu | 10 +++++----- src/layers/transform/tessellate.cu | 4 ++-- src/models/model.cpp | 2 +- .../weight_regularization/l2.cpp | 2 +- .../weight_regularization/l2.cu | 4 ++-- src/optimizers/adagrad.cu | 2 +- src/optimizers/adam.cu | 2 +- src/optimizers/rmsprop.cu | 2 +- src/optimizers/sgd.cu | 2 +- src/utils/cudnn.cpp | 8 ++++---- src/utils/distconv.cpp | 4 ++-- src/utils/lbann_library.cpp | 2 +- src/utils/profiling.cpp | 4 ++-- src/weights/initializer.cpp | 2 +- 62 files changed, 165 insertions(+), 161 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 807adfb27da..65a5e3efbf8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,16 +188,20 @@ set(LBANN_HAS_CEREAL ${CEREAL_FOUND}) # The imported target is just called "cereal". Super. 
 # Setup the linear algebra library
-find_package(Hydrogen 1.3.3 NO_MODULE QUIET
+find_package(Hydrogen 1.4.0 NO_MODULE QUIET
   HINTS ${Hydrogen_DIR} ${HYDROGEN_DIR} $ENV{Hydrogen_DIR} $ENV{HYDROGEN_DIR}
   PATH_SUFFIXES lib/cmake/hydrogen
   NO_DEFAULT_PATH)
 if (NOT Hydrogen_FOUND)
-  find_package(Hydrogen 1.3.3 NO_MODULE QUIET REQUIRED)
+  find_package(Hydrogen 1.4.0 NO_MODULE QUIET REQUIRED)
 endif ()
 message(STATUS "Found Hydrogen: ${Hydrogen_DIR}")
 set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND})
 
+if (_HYDROGEN_HAVE_ROCM)
+  message(FATAL_ERROR "ROCm not yet supported in LBANN.")
+endif ()
+
 # DiHydrogen and Distconv
 if (LBANN_WITH_DISTCONV AND NOT LBANN_WITH_DIHYDROGEN)
   message(FATAL_ERROR "Distconv requires DiHydrogen. Enable DiHydrogen to use Distconv.")
@@ -271,13 +275,13 @@ if (LBANN_WITH_ALUMINUM)
     if (NOT Aluminum_FOUND)
       message(WARNING "Using Aluminum without Hydrogen support may not be well-supported.")
-      find_package(Aluminum 0.3.0 NO_MODULE QUIET
+      find_package(Aluminum 0.4.0 NO_MODULE QUIET
         HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR}
         $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR}
         PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum
         NO_DEFAULT_PATH)
       if (NOT Aluminum_FOUND)
-        find_package(Aluminum 0.3.0 NO_MODULE QUIET)
+        find_package(Aluminum 0.4.0 NO_MODULE QUIET)
       endif ()
     endif ()
 set(LBANN_HAS_ALUMINUM ${Aluminum_FOUND})
diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh
index 1ecdc393b57..333b99b3f41 100755
--- a/bamboo/compiler_tests/build_script.sh
+++ b/bamboo/compiler_tests/build_script.sh
@@ -23,7 +23,7 @@ MPI_DIR=${COMPILER_DIR}/${MPI_LIBRARY}
 # most are MPI-independent).
 DEPENDENCY_DIR=${MPI_DIR}
 
-export CMAKE_PREFIX_PATH=${COMMON_DEPENDENCY_DIR}/catch2:${COMMON_DEPENDENCY_DIR}/cereal:${COMMON_DEPENDENCY_DIR}/clara:${COMMON_DEPENDENCY_DIR}/cub:${COMMON_DEPENDENCY_DIR}/half:${DEPENDENCY_DIR}/aluminum:${DEPENDENCY_DIR}/cnpy:${DEPENDENCY_DIR}/conduit:${DEPENDENCY_DIR}/hdf5:${DEPENDENCY_DIR}/hydrogen:${DEPENDENCY_DIR}/jpeg-turbo:${DEPENDENCY_DIR}/nccl:${DEPENDENCY_DIR}/openblas:${DEPENDENCY_DIR}/opencv:${DEPENDENCY_DIR}/protobuf:${CMAKE_PREFIX_PATH}
+export CMAKE_PREFIX_PATH=${COMMON_DEPENDENCY_DIR}/catch2:${COMMON_DEPENDENCY_DIR}/cereal:${COMMON_DEPENDENCY_DIR}/clara:${COMMON_DEPENDENCY_DIR}/cub:${COMMON_DEPENDENCY_DIR}/half:${DEPENDENCY_DIR}/aluminum-0.4.0:${DEPENDENCY_DIR}/cnpy:${DEPENDENCY_DIR}/conduit:${DEPENDENCY_DIR}/hdf5:${DEPENDENCY_DIR}/hydrogen-1.4.0:${DEPENDENCY_DIR}/jpeg-turbo:${DEPENDENCY_DIR}/nccl:${DEPENDENCY_DIR}/openblas:${DEPENDENCY_DIR}/opencv:${DEPENDENCY_DIR}/protobuf:${CMAKE_PREFIX_PATH}
 
 if [ -e ${DEPENDENCY_DIR} ]; then
diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp
index 2b3e5091436..ae71499acd4 100644
--- a/include/lbann/layers/transform/concatenate.hpp
+++ b/include/lbann/layers/transform/concatenate.hpp
@@ -377,7 +377,7 @@ fp_compute() {
   dc::tensor::Concatenate(this->get_activations(0),
                           this->get_prev_activations(0),
                           this->get_prev_activations(1),
-                          El::GPUManager::Stream());
+                          hydrogen::cuda::GetDefaultStream());
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
@@ -386,7 +386,7 @@ bp_compute() {
   dc::tensor::Slice(this->get_error_signals(0),
                     this->get_error_signals(1),
                     this->get_prev_error_signals(0),
-                    El::GPUManager::Stream());
+                    hydrogen::cuda::GetDefaultStream());
 }
 #endif // LBANN_HAS_DISTCONV
diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp
index d124487df3e..4168119156a 100644
--- a/include/lbann/utils/cuda.hpp
+++ b/include/lbann/utils/cuda.hpp
@@ -303,7 
+303,7 @@ class allocator typedef typename parent_class::system_type system_type; /** Default constructor. */ - allocator(cudaStream_t stream = El::GPUManager::Stream()); + allocator(cudaStream_t stream = hydrogen::cuda::GetDefaultStream()); /** Allocate GPU buffer. */ pointer allocate(size_type size); /** Deallocate GPU buffer. diff --git a/include/lbann/utils/impl/cuda.hpp b/include/lbann/utils/impl/cuda.hpp index 8fa2bb79ff9..cb83e1e83c5 100644 --- a/include/lbann/utils/impl/cuda.hpp +++ b/include/lbann/utils/impl/cuda.hpp @@ -443,9 +443,9 @@ void apply_entrywise_unary_operator( // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); entrywise_unary_operator_kernel - <<>>( + <<>>( height, width, input.LockedBuffer(), input.LDim(), output.Buffer(), output.LDim()); } @@ -493,9 +493,9 @@ void apply_entrywise_binary_operator( // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); entrywise_binary_operator_kernel - <<>>( + <<>>( height, width, input1.LockedBuffer(), input1.LDim(), input2.LockedBuffer(), input2.LDim(), diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index d95cb7e05ba..aee74bef640 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ -47,7 +47,7 @@ namespace callback { profiler::profiler(bool sync, bool skip_init) : callback_base(), m_sync(sync), m_skip_init(skip_init) { #ifdef LBANN_NVPROF - nvtxNameCudaStreamA(El::GPUManager::Stream(), "Hydrogen"); + nvtxNameCudaStreamA(hydrogen::cuda::GetDefaultStream(), "Hydrogen"); #endif if (!m_skip_init) { prof_start(); diff --git a/src/callbacks/sync_layers.cpp b/src/callbacks/sync_layers.cpp index f2f3efdeb0f..fea7e50c55e 100644 --- a/src/callbacks/sync_layers.cpp +++ b/src/callbacks/sync_layers.cpp @@ -58,7 +58,7 @@ void sync_layers::on_backward_prop_end(model *m, Layer *l) { void sync_layers::do_sync(Layer *l) { #ifdef LBANN_HAS_CUDNN if (m_sync_gpus) { - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); } #endif if (m_sync_mpi) { diff --git a/src/comm.cpp b/src/comm.cpp index ff07ee7ca9c..92f97b57931 100644 --- a/src/comm.cpp +++ b/src/comm.cpp @@ -167,7 +167,7 @@ void UpdateRequest(typename ::Al::NCCLBackend::req_type& req, El::SyncInfo const& si) noexcept { if (req) - req->orig_stream = si.stream_; + req->orig_stream = si.Stream(); } #endif // AL_HAS_NCCL @@ -181,7 +181,7 @@ void UpdateRequest(typename ::Al::MPICUDABackend::req_type& req, El::SyncInfo const& si) noexcept { if (req) - req->orig_stream = si.stream_; + req->orig_stream = si.Stream(); } #endif // AL_HAS_MPI_CUDA #endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_ALUMINUM) diff --git a/src/layers/activations/elu.cu b/src/layers/activations/elu.cu index 7999168dd3d..b0bf373916d 100644 --- a/src/layers/activations/elu.cu +++ b/src/layers/activations/elu.cu @@ -96,7 +96,7 @@ void local_fp(TensorDataType alpha, // Launch CUDA kernel if (grid_dim > 0) { - fp_kernel<<>>( + fp_kernel<<>>( alpha, height, width, input.LockedBuffer(), input.LDim(), output.Buffer(), output.LDim()); @@ -125,7 +125,7 @@ void local_bp(TensorDataType alpha, // Launch CUDA kernel if (grid_dim > 0) { - bp_kernel<<>>( + bp_kernel<<>>( alpha, height, width, input.LockedBuffer(), input.LDim(), gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(), diff --git a/src/layers/activations/leaky_relu.cu 
b/src/layers/activations/leaky_relu.cu
index 0a6ed4fd058..02d85c4e747 100644
--- a/src/layers/activations/leaky_relu.cu
+++ b/src/layers/activations/leaky_relu.cu
@@ -96,7 +96,7 @@ void local_fp(TensorDataType negative_slope,
 
   // Launch CUDA kernel
   if (grid_dim > 0) {
-    fp_kernel<<<grid_dim, block_dim, 0, El::GPUManager::Stream()>>>(
+    fp_kernel<<<grid_dim, block_dim, 0, hydrogen::cuda::GetDefaultStream()>>>(
       negative_slope, height, width,
       input.LockedBuffer(), input.LDim(),
       output.Buffer(), output.LDim());
@@ -125,7 +125,7 @@ void local_bp(TensorDataType negative_slope,
 
   // Launch CUDA kernel
   if (grid_dim > 0) {
-    bp_kernel<<<grid_dim, block_dim, 0, El::GPUManager::Stream()>>>(
+    bp_kernel<<<grid_dim, block_dim, 0, hydrogen::cuda::GetDefaultStream()>>>(
       negative_slope, height, width,
       input.LockedBuffer(), input.LDim(),
       gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(),
diff --git a/src/layers/activations/log_softmax.cu b/src/layers/activations/log_softmax.cu
index 8c93bd7a5a9..d4af472b7ca 100644
--- a/src/layers/activations/log_softmax.cu
+++ b/src/layers/activations/log_softmax.cu
@@ -301,8 +301,8 @@ void fp_compute_impl(log_softmax_layer<TensorDataType, data_layout::DATA_PARALLEL, El::Device::GPU>& l) {
 
   // GPU objects
-  auto&& stream = El::GPUManager::Stream();
-  auto&& event = El::GPUManager::Event();
+  auto&& stream = hydrogen::cuda::GetDefaultStream();
+  auto&& event = hydrogen::cuda::GetDefaultEvent();
   El::SyncInfo<El::Device::GPU> sync_info{stream, event};
 
   // Find max value in each column
@@ -383,8 +383,8 @@ void bp_compute_impl(log_softmax_layer<TensorDataType, data_layout::DATA_PARALLEL, El::Device::GPU>& l) {
 
   // GPU objects
-  auto&& stream = El::GPUManager::Stream();
-  auto&& event = El::GPUManager::Event();
+  auto&& stream = hydrogen::cuda::GetDefaultStream();
+  auto&& event = hydrogen::cuda::GetDefaultEvent();
   El::SyncInfo<El::Device::GPU> sync_info{stream, event};
 
   // Compute sum of entries in gradient w.r.t. output
diff --git a/src/layers/activations/softmax.cu b/src/layers/activations/softmax.cu
index 95965f53426..e7b38046d07 100644
--- a/src/layers/activations/softmax.cu
+++ b/src/layers/activations/softmax.cu
@@ -377,8 +377,8 @@ void fp_compute_impl(softmax_layer<TensorDataType, data_layout::DATA_PARALLEL, El::Device::GPU>& l) {
 
   // GPU objects
-  auto&& stream = El::GPUManager::Stream();
-  auto&& event = El::GPUManager::Event();
+  auto&& stream = hydrogen::cuda::GetDefaultStream();
+  auto&& event = hydrogen::cuda::GetDefaultEvent();
   El::SyncInfo<El::Device::GPU> sync_info{stream, event};
 
   // Find max value in each column
@@ -462,8 +462,8 @@ void bp_compute_impl(softmax_layer<TensorDataType, data_layout::DATA_PARALLEL, El::Device::GPU>& l) {
 
   // GPU objects
-  auto&& stream = El::GPUManager::Stream();
-  auto&& event = El::GPUManager::Event();
+  auto&& stream = hydrogen::cuda::GetDefaultStream();
+  auto&& event = hydrogen::cuda::GetDefaultEvent();
   El::SyncInfo<El::Device::GPU> sync_info{stream, event};
 
   // Compute dot(y,dy)
diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp
index e744d505896..56163f0bbcb 100644
--- a/src/layers/data_type_distconv_adapter.cpp
+++ b/src/layers/data_type_distconv_adapter.cpp
@@ -327,7 +327,7 @@ setup_prev_activations_i(int index) const {
     const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
     t = make_unique<TensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(El::GPUManager::Stream());
+    t->zero(hydrogen::cuda::GetDefaultStream());
   } else {
     // Create a shallow copy
     const auto &parent_activations =
@@ -421,7 +421,7 @@ setup_activations_i(int index) const {
   const auto local_shape = get_activations_local_shape(index);
   auto t = make_unique<TensorDevType>(shape, loc, dist, local_shape);
   assert0(t->allocate());
-  t->zero(El::GPUManager::Stream());
+  t->zero(hydrogen::cuda::GetDefaultStream());
   return t;
 }
@@ -475,7 +475,7 @@ setup_prev_error_signals_i(int index) const {
     const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
     t = make_unique<TensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(El::GPUManager::Stream());
+    t->zero(hydrogen::cuda::GetDefaultStream());
   } else {
     // Create a shallow copy
     const auto &child_error_signals =
@@ -540,7 +540,7 @@ setup_error_signals_i(int index) const {
   const auto local_shape = get_error_signals_local_shape(index);
   auto t = make_unique<TensorDevType>(shape, loc, dist, local_shape);
   assert0(t->allocate());
-  t->zero(El::GPUManager::Stream());
+  t->zero(hydrogen::cuda::GetDefaultStream());
   return t;
 }
@@ -774,7 +774,7 @@ void data_type_distconv_adapter<TensorDataType>::ensure_prev_activations() {
     shuffler.shuffle_forward(
       get_original_prev_activations().get_const_base_ptr(),
       get_prev_activations().get_base_ptr(),
-      El::GPUManager::Stream());
+      hydrogen::cuda::GetDefaultStream());
   }
 }
@@ -796,7 +796,7 @@ void data_type_distconv_adapter<TensorDataType>::copy_out_activations() {
     shuffler.shuffle_forward(
       get_activations().get_const_base_ptr(),
      get_original_activations().get_base_ptr(),
-      El::GPUManager::Stream());
+ hydrogen::cuda::GetDefaultStream()); } } @@ -823,7 +823,7 @@ void data_type_distconv_adapter::ensure_prev_error_signals() { shuffler.shuffle_forward( get_original_prev_error_signals(i).get_const_base_ptr(), get_prev_error_signals(i).get_base_ptr(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } @@ -846,7 +846,7 @@ void data_type_distconv_adapter::copy_out_error_signals() { shuffler.shuffle_forward( get_error_signals(i).get_const_base_ptr(), get_original_error_signals(i).get_base_ptr(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } diff --git a/src/layers/data_type_layer.cpp b/src/layers/data_type_layer.cpp index f2f6ad3c1b0..0f34fc53a00 100644 --- a/src/layers/data_type_layer.cpp +++ b/src/layers/data_type_layer.cpp @@ -122,7 +122,7 @@ void data_type_layer::forward_prop() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) #ifdef LBANN_HAS_DISTCONV @@ -143,7 +143,7 @@ void data_type_layer::forward_prop() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) m_fp_time += get_time() - fp_start; @@ -161,7 +161,7 @@ void data_type_layer::back_prop_impl_() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) #ifdef LBANN_HAS_DISTCONV @@ -182,7 +182,7 @@ void data_type_layer::back_prop_impl_() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) m_bp_time += get_time() - bp_start; diff --git a/src/layers/image/bilinear_resize.cu b/src/layers/image/bilinear_resize.cu index d755373b67b..b2326d1e59d 100644 --- a/src/layers/image/bilinear_resize.cu +++ b/src/layers/image/bilinear_resize.cu @@ -147,7 +147,7 @@ void bilinear_resize_layer::fp_compute() { // Launch CUDA kernel if (grid_dim > 0) { fp_kernel - <<>>( + <<>>( num_samples, num_channels, input_height, input_width, local_input.LockedBuffer(), local_input.LDim(), diff --git a/src/layers/io/input/input_layer.cpp b/src/layers/io/input/input_layer.cpp index 6efae118ec2..eca519118da 100644 --- a/src/layers/io/input/input_layer.cpp +++ b/src/layers/io/input/input_layer.cpp @@ -165,7 +165,7 @@ setup_activations_i(int index) const { const auto local_shape = get_activations_local_shape(index); auto t = make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(El::GPUManager::Stream()); + t->zero(hydrogen::cuda::GetDefaultStream()); return t; } } @@ -252,7 +252,7 @@ template ::fp_compute() { auto &l = dynamic_cast&>(this->layer()); - auto stream = El::GPUManager::Stream(); + auto stream = hydrogen::cuda::GetDefaultStream(); // Note that the mini-batch size of the data reader is not // actually the one for the current mini-batch as the mini-batch // index is already updated by fp_compute. 
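
Every hunk in this patch applies the same substitution: the El::GPUManager singleton interface from Hydrogen 1.3 is replaced by free functions in the hydrogen::cuda, hydrogen::gpu, and hydrogen::cublas namespaces. A minimal sketch of an affected call site under the 1.4 API, collected from the hunks in this patch; the function name example_gpu_setup is a hypothetical placeholder, CHECK_CUDA is LBANN's existing CUDA error-checking macro, and the necessary LBANN/Hydrogen headers are assumed to be on the include path:

    // Hydrogen 1.3 -> 1.4 replacements used throughout this patch:
    //   El::GPUManager::Stream()            -> hydrogen::cuda::GetDefaultStream()
    //   El::GPUManager::Event()             -> hydrogen::cuda::GetDefaultEvent()
    //   El::GPUManager::Device()            -> hydrogen::gpu::DefaultDevice()
    //   El::GPUManager::SynchronizeDevice() -> hydrogen::gpu::SynchronizeDevice()
    //   El::GPUManager::NumDevices()        -> hydrogen::gpu::DeviceCount()
    //   El::GPUManager::cuBLASHandle()      -> hydrogen::cublas::GetLibraryHandle()
    //   si.stream_ (public member)          -> si.Stream() (accessor on El::SyncInfo)
    void example_gpu_setup() {
      // Bind the caller to Hydrogen's default device before queuing work.
      CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice()));
      auto&& stream = hydrogen::cuda::GetDefaultStream();
      auto&& event = hydrogen::cuda::GetDefaultEvent();
      // Bundle stream and event for Hydrogen's synchronization machinery.
      El::SyncInfo<El::Device::GPU> sync_info{stream, event};
      // ... enqueue GPU work ordered by `stream`, passing `sync_info` to Hydrogen ...
    }
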
diff --git a/src/layers/learning/base_convolution.cpp b/src/layers/learning/base_convolution.cpp index 68c3d42cbf7..151b385e0bf 100644 --- a/src/layers/learning/base_convolution.cpp +++ b/src/layers/learning/base_convolution.cpp @@ -1365,7 +1365,7 @@ void base_convolution_adapter::bp_compute_convolution_fi this->get_prev_error_signals(), dst_scale, *m_bias_gradient, false); } else { - m_bias_gradient->scale(dst_scale, El::GPUManager::Stream()); + m_bias_gradient->scale(dst_scale, hydrogen::cuda::GetDefaultStream()); } } @@ -1383,7 +1383,7 @@ void base_convolution_adapter::bp_compute_convolution_fi dst_scale, *m_kernel_gradient, false); } else { - m_kernel_gradient->scale(dst_scale, El::GPUManager::Stream()); + m_kernel_gradient->scale(dst_scale, hydrogen::cuda::GetDefaultStream()); } } #endif // LBANN_HAS_DISTCONV diff --git a/src/layers/learning/channelwise_scale_bias.cu b/src/layers/learning/channelwise_scale_bias.cu index e603a36f5b2..54e3b07bc57 100644 --- a/src/layers/learning/channelwise_scale_bias.cu +++ b/src/layers/learning/channelwise_scale_bias.cu @@ -204,7 +204,7 @@ void channelwise_scale_bias_layer::fp_compute() { grid_dims.y = (local_width + block_size_y - 1) / block_size_y; grid_dims.z = num_channels; fp_kernel - <<>>( + <<>>( num_channels, channel_size, local_width, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), @@ -254,7 +254,7 @@ void channelwise_scale_bias_layer::bp_compute() { grid_dims.y = (local_width + block_size_y - 1) / block_size_y; grid_dims.z = num_channels; bp_kernel - <<>>( + <<>>( num_channels, channel_size, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), diff --git a/src/layers/learning/embedding.cu b/src/layers/learning/embedding.cu index 29e002be307..df780404ded 100644 --- a/src/layers/learning/embedding.cu +++ b/src/layers/learning/embedding.cu @@ -135,7 +135,7 @@ void embedding_layer::fp_compute() { grid_dims.x = (this->m_embedding_dim + block_size - 1) / block_size; grid_dims.y = input_size; grid_dims.z = local_mini_batch_size; - fp_kernel<<>>( + fp_kernel<<>>( this->m_num_embeddings, this->m_embedding_dim, input_size, @@ -177,7 +177,7 @@ void embedding_layer::bp_compute() { grid_dims.x = (this->m_embedding_dim + block_size - 1) / block_size; grid_dims.y = input_size; grid_dims.z = local_mini_batch_size; - bp_kernel<<>>( + bp_kernel<<>>( this->m_num_embeddings, this->m_embedding_dim, input_size, diff --git a/src/layers/learning/entrywise_scale_bias.cu b/src/layers/learning/entrywise_scale_bias.cu index 695986244cc..d16dd3b3857 100644 --- a/src/layers/learning/entrywise_scale_bias.cu +++ b/src/layers/learning/entrywise_scale_bias.cu @@ -118,7 +118,7 @@ void fp_impl( block_dims.y = block_size_y; grid_dims.x = (local_height + block_size_x - 1) / block_size_x; grid_dims.y = (local_width + block_size_y - 1) / block_size_y; - fp_kernel<<>>( + fp_kernel<<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), @@ -153,7 +153,7 @@ void bp_impl( dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; - bp_kernel <<>>( + bp_kernel <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), diff --git a/src/layers/loss/categorical_accuracy.cu b/src/layers/loss/categorical_accuracy.cu index 5b1057887e6..b70a1f6b5ad 
100644 --- a/src/layers/loss/categorical_accuracy.cu +++ b/src/layers/loss/categorical_accuracy.cu @@ -166,8 +166,8 @@ void fp_gpu(lbann_comm& comm, const auto& col_comm_root = loss.RowOwner(0); // GPU objects - auto&& stream = El::GPUManager::Stream(); - auto&& event = El::GPUManager::Event(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + auto&& event = hydrogen::cuda::GetDefaultEvent(); El::SyncInfo sync_info{stream, event}; // Initialize CUDA threads/blocks for reduction kernel diff --git a/src/layers/loss/cross_entropy.cu b/src/layers/loss/cross_entropy.cu index a8980e621ba..1a9aa30c83e 100644 --- a/src/layers/loss/cross_entropy.cu +++ b/src/layers/loss/cross_entropy.cu @@ -91,9 +91,9 @@ void local_fp_gpu(const El::AbstractMatrix& local_prediction, block_dims.x = block_size; grid_dims.x = (height + block_size - 1) / block_size; grid_dims.y = width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( height, width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), @@ -147,9 +147,9 @@ void local_bp_gpu(const El::AbstractMatrix& local_prediction, block_dims.x = block_size; grid_dims.x = (height + block_size - 1) / block_size; grid_dims.y = width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( height, width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), diff --git a/src/layers/loss/entrywise.cu b/src/layers/loss/entrywise.cu index bddaf92b53b..deecafdee7f 100644 --- a/src/layers/loss/entrywise.cu +++ b/src/layers/loss/entrywise.cu @@ -92,9 +92,9 @@ void apply_binary_backprop_operator( // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); binary_backprop_operator_kernel - <<>>( + <<>>( height, width, x1.LockedBuffer(), x1.LDim(), x2.LockedBuffer(), x2.LDim(), diff --git a/src/layers/loss/l1_norm.cu b/src/layers/loss/l1_norm.cu index 8e5db8ef411..63dabae361d 100644 --- a/src/layers/loss/l1_norm.cu +++ b/src/layers/loss/l1_norm.cu @@ -84,9 +84,9 @@ void local_fp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_contribution.Buffer()); @@ -132,9 +132,9 @@ void local_bp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), diff --git a/src/layers/loss/l2_norm2.cu b/src/layers/loss/l2_norm2.cu index 916375a776b..78c4823b0cf 100644 --- a/src/layers/loss/l2_norm2.cu +++ b/src/layers/loss/l2_norm2.cu @@ -84,9 +84,9 @@ void local_fp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; 
- CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_contribution.Buffer()); @@ -125,9 +125,9 @@ void local_bp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), diff --git a/src/layers/loss/mean_absolute_error.cu b/src/layers/loss/mean_absolute_error.cu index 0b591b92836..ed8332a1045 100644 --- a/src/layers/loss/mean_absolute_error.cu +++ b/src/layers/loss/mean_absolute_error.cu @@ -90,9 +90,9 @@ void local_fp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), @@ -157,9 +157,9 @@ void local_bp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), diff --git a/src/layers/loss/mean_squared_error.cu b/src/layers/loss/mean_squared_error.cu index 6a404cb7fe9..5a57666299e 100644 --- a/src/layers/loss/mean_squared_error.cu +++ b/src/layers/loss/mean_squared_error.cu @@ -90,9 +90,9 @@ void local_fp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), @@ -148,9 +148,9 @@ void local_bp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), diff --git a/src/layers/loss/top_k_categorical_accuracy.cu b/src/layers/loss/top_k_categorical_accuracy.cu index a388e1ab6ff..472791d688f 100644 --- a/src/layers/loss/top_k_categorical_accuracy.cu +++ b/src/layers/loss/top_k_categorical_accuracy.cu @@ -201,8 +201,8 @@ void fp_gpu(lbann_comm& comm, const auto& col_comm_root = loss.RowOwner(0); // GPU objects - auto&& stream = El::GPUManager::Stream(); - auto&& event = El::GPUManager::Event(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + auto&& event = hydrogen::cuda::GetDefaultEvent(); 
El::SyncInfo syncInfo{stream, event}; cuda::thrust::allocator<> alloc(stream); diff --git a/src/layers/math/binary.cu b/src/layers/math/binary.cu index 3367334a08c..23681272089 100644 --- a/src/layers/math/binary.cu +++ b/src/layers/math/binary.cu @@ -93,9 +93,9 @@ void apply_binary_backprop_operator(const El::AbstractMatrix& x1 // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); binary_backprop_operator_kernel - <<>>( + <<>>( height, width, x1.LockedBuffer(), x1.LDim(), x2.LockedBuffer(), x2.LDim(), diff --git a/src/layers/math/clamp.cu b/src/layers/math/clamp.cu index 13947b016c6..04a0583419e 100644 --- a/src/layers/math/clamp.cu +++ b/src/layers/math/clamp.cu @@ -101,7 +101,7 @@ void local_fp(TensorDataType min, // Launch CUDA kernel if (grid_dim > 0) { - fp_kernel<<>>( + fp_kernel<<>>( min, max, height, width, input.LockedBuffer(), input.LDim(), output.Buffer(), output.LDim()); @@ -131,7 +131,7 @@ void local_bp(TensorDataType min, // Launch CUDA kernel if (grid_dim > 0) { - bp_kernel<<>>( + bp_kernel<<>>( min, max, height, width, input.LockedBuffer(), input.LDim(), gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(), diff --git a/src/layers/math/matmul.cpp b/src/layers/math/matmul.cpp index 4baabf62f03..d529aa7b729 100644 --- a/src/layers/math/matmul.cpp +++ b/src/layers/math/matmul.cpp @@ -173,7 +173,7 @@ void fp_compute_impl(matmul_layer::fp_compute() { grid_dims.y = num_channels; grid_dims.z = local_width; mean_kernel - <<>>( + <<>>( num_channels, channel_size, local_width, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim()); @@ -171,7 +171,7 @@ void channelwise_mean_layer::bp_compute() { grid_dims.x = (channel_size + block_size - 1) / block_size; grid_dims.y = num_channels; grid_dims.z = local_width; - backprop_kernel<<>>( + backprop_kernel<<>>( num_channels, channel_size, local_width, local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), diff --git a/src/layers/misc/channelwise_softmax.cu b/src/layers/misc/channelwise_softmax.cu index 641d7708e7e..842d9f60da8 100644 --- a/src/layers/misc/channelwise_softmax.cu +++ b/src/layers/misc/channelwise_softmax.cu @@ -235,7 +235,7 @@ void fp_impl(size_t num_channels, grid_dims.z = local_mini_batch_size; LocalMat maxvals(grid_dims.x * num_channels, local_mini_batch_size); fp_max_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_input.LockedBuffer(), {static_cast(local_input.LDim()), channel_size, 1}, @@ -247,7 +247,7 @@ void fp_impl(size_t num_channels, const LocalMat prev_maxvals(std::move(maxvals)); maxvals.Resize(grid_dims.x * num_channels, local_mini_batch_size); fp_max_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, prev_dim}, prev_maxvals.LockedBuffer(), {static_cast(prev_maxvals.LDim()), prev_dim, 1}, @@ -268,7 +268,7 @@ void fp_impl(size_t num_channels, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; fp_denom_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_input.LockedBuffer(), {static_cast(local_input.LDim()), channel_size, 1}, @@ -285,7 +285,7 @@ void fp_impl(size_t num_channels, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; fp_output_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_input.LockedBuffer(), {static_cast(local_input.LDim()), channel_size, 1}, @@ -446,7 +446,7 @@ void bp_impl(size_t num_channels, 
grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_y_dot_dy_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_output.LockedBuffer(), {static_cast(local_output.LDim()), channel_size, 1}, @@ -464,7 +464,7 @@ void bp_impl(size_t num_channels, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_input_grad_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_output.LockedBuffer(), {static_cast(local_output.LDim()), channel_size, 1}, diff --git a/src/layers/misc/covariance.cu b/src/layers/misc/covariance.cu index 91c906b676c..9a488d18955 100644 --- a/src/layers/misc/covariance.cu +++ b/src/layers/misc/covariance.cu @@ -209,7 +209,7 @@ void fp_gpu(const El::AbstractDistMatrix& input0, grid_dims.y = local_width; const auto& scale = El::TypeTraits::One() / TensorDataType(height); mean_contribution_kernel - <<>>( + <<>>( local_height, local_width, scale, local_input0.LockedBuffer(), local_input0.LDim(), local_input1.LockedBuffer(), local_input1.LDim(), @@ -229,7 +229,7 @@ void fp_gpu(const El::AbstractDistMatrix& input0, grid_dims.y = local_width; const auto& scale = El::TypeTraits::One() / (biased ? TensorDataType(height) : TensorDataType(height - 1)); covariance_contribution_kernel - <<>>( + <<>>( local_height, local_width, scale, local_input0.LockedBuffer(), local_input0.LDim(), local_input1.LockedBuffer(), local_input1.LDim(), @@ -276,7 +276,7 @@ void bp_gpu(const El::AbstractDistMatrix& input0, El::Int grid_size = (local_height * local_width + block_size - 1) / block_size; if (grid_size > 0) { covariance_backprop_kernel - <<>>( + <<>>( local_height, local_width, scale, local_workspace.LockedBuffer(), local_input0.LockedBuffer(), local_input0.LDim(), diff --git a/src/layers/misc/dist_embedding.cu b/src/layers/misc/dist_embedding.cu index 557cc04a499..d2a7ee2e895 100644 --- a/src/layers/misc/dist_embedding.cu +++ b/src/layers/misc/dist_embedding.cu @@ -341,7 +341,7 @@ void dist_embedding_layer::fp_compute() { const size_t local_mini_batch_size = local_input.Width(); // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); nvshmem::initialize(); // Barrier to handle gradient checking @@ -523,7 +523,7 @@ void dist_embedding_layer::bp_compute() { const size_t local_mini_batch_size = local_output_grad.Width(); // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); // Synchronize non-blocking barrier // Note: Make sure NVSHMEM workspaces are ready to recieve gradients. @@ -653,7 +653,7 @@ void dist_embedding_layer::apply_sparse_sgd_step( LocalMat& local_embeddings) { // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); // Synchronize non-blocking barrier // Note: Make sure gradients have been received. 
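
Note that each kernel-launch hunk in this patch touches only the fourth launch-configuration argument, the CUDA stream. A schematic sketch of the pattern, reusing the grid/block bookkeeping visible in the surrounding hunks; the kernel body and its argument list are placeholders (the per-file kernel signatures vary), and the Hydrogen headers are assumed available:

    // Placeholder kernel: copies a column-major local matrix entrywise.
    __global__ void fp_kernel(size_t height, size_t width,
                              const float* input, float* output) {
      const size_t row = blockIdx.x * blockDim.x + threadIdx.x;
      const size_t col = blockIdx.y;
      if (row < height && col < width) {
        output[row + col * height] = input[row + col * height];
      }
    }

    void launch_fp(size_t local_height, size_t local_width,
                   const float* input, float* output) {
      constexpr size_t block_size = 256;
      dim3 block_dims, grid_dims;
      block_dims.x = block_size;
      grid_dims.x = (local_height + block_size - 1) / block_size;
      grid_dims.y = local_width;
      // Hydrogen 1.3 passed El::GPUManager::Stream() as the stream argument.
      fp_kernel<<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>(
          local_height, local_width, input, output);
    }
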
diff --git a/src/layers/misc/one_hot.cu b/src/layers/misc/one_hot.cu index 2ebfb92a9a8..8053ac71528 100644 --- a/src/layers/misc/one_hot.cu +++ b/src/layers/misc/one_hot.cu @@ -75,7 +75,7 @@ void one_hot_layer::fp_compute() { const size_t local_width = local_output.Width(); constexpr size_t block_size = 64; const size_t grid_size = (local_width + block_size - 1) / block_size; - fp_kernel<<>>( + fp_kernel<<>>( local_height, local_width, local_input.LockedBuffer(), diff --git a/src/layers/misc/variance.cu b/src/layers/misc/variance.cu index 8c70b7bb9aa..0a7ea7b08ef 100644 --- a/src/layers/misc/variance.cu +++ b/src/layers/misc/variance.cu @@ -150,7 +150,7 @@ void fp_gpu(const El::AbstractDistMatrix& input, grid_dims.y = local_width; const auto& scale = El::TypeTraits::One() / (biased ? TensorDataType(height) : TensorDataType(height - 1)); variance_contribution_kernel - <<>>( + <<>>( local_height, local_width, scale, local_input.LockedBuffer(), local_input.LDim(), local_means.LockedBuffer(), @@ -192,7 +192,7 @@ void bp_gpu(const El::AbstractDistMatrix& input, El::Int grid_size = (local_height * local_width + block_size - 1) / block_size; if (grid_size > 0) { variance_backprop_kernel - <<>>( + <<>>( local_height, local_width, scale, local_workspace.LockedBuffer(), local_input.LockedBuffer(), local_input.LDim(), diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index c81cb076fe2..4f6a44e5a30 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -405,8 +405,8 @@ void batch_normalization_layer::fp_compute() { const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - auto&& stream = El::GPUManager::Stream(); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); + auto&& stream = hydrogen::cuda::GetDefaultStream(); // Matrices const auto& input = this->get_prev_activations(); @@ -523,8 +523,8 @@ void batch_normalization_layer::bp_compute() { const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - auto&& stream = El::GPUManager::Stream(); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); + auto&& stream = hydrogen::cuda::GetDefaultStream(); // Matrices const auto& local_scale = this->weights_values(0).LockedMatrix(); diff --git a/src/layers/regularizers/entrywise_batch_normalization.cu b/src/layers/regularizers/entrywise_batch_normalization.cu index 3acd10ba1ab..ba0133a3148 100644 --- a/src/layers/regularizers/entrywise_batch_normalization.cu +++ b/src/layers/regularizers/entrywise_batch_normalization.cu @@ -127,7 +127,7 @@ void compute_batch_statistics(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; row_sums_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), @@ -155,7 +155,7 @@ void compute_batch_statistics(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; compute_statistics_kernel - <<>>( + <<>>( local_height, statistics_count, decay, @@ -219,7 +219,7 @@ void apply_batchnorm(DataType epsilon, grid_dims.x = (local_height + block_size_x - 1) / block_size_x; grid_dims.y = (local_width + block_size_y - 1) / block_size_y; batchnorm_kernel - <<>>( + <<>>( local_height, 
local_width, epsilon, @@ -419,7 +419,7 @@ void bp_training_impl(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; bp_training_stats_gradient_kernel - <<>>( + <<>>( local_height, local_width, epsilon, @@ -452,7 +452,7 @@ void bp_training_impl(lbann_comm& comm, grid_dims.x = (local_height + block_size_x - 1) / block_size_x; grid_dims.y = (local_width + block_size_y - 1) / block_size_y; bp_training_error_signal_kernel - <<>>( + <<>>( local_height, local_width, epsilon, @@ -530,7 +530,7 @@ void bp_inference_impl(DataType epsilon, grid_dims.x = (local_height + block_size_x - 1) / block_size_x; grid_dims.y = (local_width + block_size_y - 1) / block_size_y; bp_inference_kernel - <<>>( + <<>>( local_height, local_width, epsilon, diff --git a/src/layers/regularizers/instance_norm.cu b/src/layers/regularizers/instance_norm.cu index f1b0a7f4775..b256d6c9b5d 100644 --- a/src/layers/regularizers/instance_norm.cu +++ b/src/layers/regularizers/instance_norm.cu @@ -208,7 +208,7 @@ void fp_impl(lbann_comm& comm, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; fp_sums_kernel - <<>>( + <<>>( local_mini_batch_size, num_channels, channel_size, local_input.LockedBuffer(), local_input.LDim(), local_sums.Buffer(), local_sums.LDim(), @@ -223,7 +223,7 @@ void fp_impl(lbann_comm& comm, grid_dims.x = (channel_size + block_size - 1) / block_size; grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; - fp_output_kernel<<>>( + fp_output_kernel<<>>( local_mini_batch_size, num_channels, channel_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), @@ -454,7 +454,7 @@ void bp_impl(lbann_comm& comm, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_statistics_grad_kernel - <<>>( + <<>>( local_mini_batch_size, num_channels, channel_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(), @@ -473,7 +473,7 @@ void bp_impl(lbann_comm& comm, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_input_grad_kernel - <<>>( + <<>>( local_mini_batch_size, num_channels, channel_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(), diff --git a/src/layers/regularizers/layer_norm.cu b/src/layers/regularizers/layer_norm.cu index 11b55d7cce2..cd2ec15072f 100644 --- a/src/layers/regularizers/layer_norm.cu +++ b/src/layers/regularizers/layer_norm.cu @@ -202,7 +202,7 @@ void fp_impl(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; - fp_sums_kernel<<>>( + fp_sums_kernel<<>>( local_num_samples, local_sample_size, local_input.LockedBuffer(), local_input.LDim(), local_means.Buffer(), local_means.LDim(), @@ -220,7 +220,7 @@ void fp_impl(lbann_comm& comm, dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (local_num_samples + block_size - 1) / block_size; - fp_statistics_kernel<<>>( + fp_statistics_kernel<<>>( sample_size, local_num_samples, local_means.Buffer(), local_means.LDim(), local_vars.Buffer(), local_vars.LDim()); @@ -233,7 +233,7 @@ void fp_impl(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; - fp_output_kernel<<>>( + fp_output_kernel<<>>( local_num_samples, local_sample_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), 
local_output.Buffer(), local_output.LDim(), @@ -407,7 +407,7 @@ void bp_impl(lbann_comm& comm, grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; bp_statistics_grad_kernel - <<>>( + <<>>( local_num_samples, local_sample_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(), @@ -428,7 +428,7 @@ void bp_impl(lbann_comm& comm, grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; bp_input_grad_kernel - <<>>( + <<>>( sample_size, local_num_samples, local_sample_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(), diff --git a/src/layers/transform/concatenate.cu b/src/layers/transform/concatenate.cu index 0733cc9bb09..9b729f1606d 100644 --- a/src/layers/transform/concatenate.cu +++ b/src/layers/transform/concatenate.cu @@ -195,7 +195,7 @@ void fp_compute_impl( auto& output = l.get_activations(); auto& local_output = dynamic_cast(output.Matrix()); auto&& sync_info = El::SyncInfoFromMatrix(local_output); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each input tensor const size_t num_inputs = l.get_num_parents(); @@ -348,7 +348,7 @@ void bp_compute_impl( const auto& output_grad = l.get_prev_error_signals(); auto& local_output_grad = dynamic_cast(output_grad.LockedMatrix()); auto&& sync_info = El::SyncInfoFromMatrix(local_output_grad); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each input gradient tensor const size_t num_inputs = l.get_num_parents(); diff --git a/src/layers/transform/crop.cu b/src/layers/transform/crop.cu index b84e6d364c0..6029c179e94 100644 --- a/src/layers/transform/crop.cu +++ b/src/layers/transform/crop.cu @@ -185,7 +185,7 @@ void crop_layer::fp_compute_3d() { block_dims.x = block_size; grid_dims.x = (output_size + block_size - 1) / block_size; grid_dims.y = local_width; - fp_compute_3d_kernel<<>>( + fp_compute_3d_kernel<<>>( input_dims[2], input_dims[1], input_dims[0], output_dims[2], output_dims[1], output_dims[0], local_width, @@ -221,7 +221,7 @@ void crop_layer::bp_compute_3d() { block_dims.x = block_size; grid_dims.x = (output_size + block_size - 1) / block_size; grid_dims.y = local_width; - bp_compute_3d_kernel<<>>( + bp_compute_3d_kernel<<>>( input_dims[2], input_dims[1], input_dims[0], output_dims[2], output_dims[1], output_dims[0], local_width, diff --git a/src/layers/transform/evaluation.cpp b/src/layers/transform/evaluation.cpp index 10ffb3f0bf9..6ff4e773f00 100644 --- a/src/layers/transform/evaluation.cpp +++ b/src/layers/transform/evaluation.cpp @@ -103,8 +103,8 @@ void fp_gpu(lbann_comm& comm, ones_d.SetMemoryMode(1); // Use CUB GPU memory pool #endif // HYDROGEN_HAVE_CUB sum_d.Resize(1, 1); - auto&& handle = El::GPUManager::cuBLASHandle(); - auto&& stream = El::GPUManager::Stream(); + auto&& handle = hydrogen::cublas::GetLibraryHandle(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); // Compute sum of local input matrix entries diff --git a/src/layers/transform/in_top_k.cu b/src/layers/transform/in_top_k.cu index 35d4e073bf6..1d02eca89db 100644 --- a/src/layers/transform/in_top_k.cu +++ b/src/layers/transform/in_top_k.cu @@ -188,8 +188,8 @@ void fp_gpu(lbann_comm& comm, const auto& col_comm_size = El::mpi::Size(col_comm); // GPU 
objects - auto&& stream = El::GPUManager::Stream(); - auto&& event = El::GPUManager::Event(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + auto&& event = hydrogen::cuda::GetDefaultEvent(); cuda::thrust::allocator<> alloc(stream); // Find top-k entries in each column of local prediction matrix diff --git a/src/layers/transform/slice.cu b/src/layers/transform/slice.cu index f1e478632fa..cdba394b5d8 100644 --- a/src/layers/transform/slice.cu +++ b/src/layers/transform/slice.cu @@ -192,7 +192,7 @@ void fp_compute_impl( const auto& input = l.get_prev_activations(); const auto& local_input = dynamic_cast(input.LockedMatrix()); auto&& sync_info = El::SyncInfoFromMatrix(local_input); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each output tensor const size_t num_outputs = l.get_num_children(); @@ -341,7 +341,7 @@ void bp_compute_impl( auto& input_grad = l.get_error_signals(); auto& local_input_grad = dynamic_cast(input_grad.Matrix()); auto&& sync_info = El::SyncInfoFromMatrix(local_input_grad); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each output gradient tensor const size_t num_outputs = l.get_num_children(); diff --git a/src/layers/transform/sort.cu b/src/layers/transform/sort.cu index 4707459a456..987d3b24dce 100644 --- a/src/layers/transform/sort.cu +++ b/src/layers/transform/sort.cu @@ -48,7 +48,7 @@ void sort_layer::fp_compute() { const auto& local_width = local_input.Width(); // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); cuda::thrust::allocator<> alloc(stream); // Sort each matrix column @@ -82,7 +82,7 @@ void sort_layer::bp_compute() { const auto& local_width = local_gradient_wrt_input.Width(); // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); cuda::thrust::allocator<> alloc(stream); // Scatter gradients based on sorted indices diff --git a/src/layers/transform/split.cu b/src/layers/transform/split.cu index e964b2ee839..615a4dbe058 100644 --- a/src/layers/transform/split.cu +++ b/src/layers/transform/split.cu @@ -71,29 +71,29 @@ void split_distconv_adapter::bp_compute() { auto &error_signals = this->get_error_signals(0); switch (this->layer().get_num_children()) { case 0: - error_signals.zero(El::GPUManager::Stream()); + error_signals.zero(hydrogen::cuda::GetDefaultStream()); break; case 1: dc::tensor::Copy(error_signals, this->get_prev_error_signals(0), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; case 2: dc::tensor::Transform(error_signals, this->get_prev_error_signals(0), this->get_prev_error_signals(1), sum_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; default: dc::tensor::Copy(error_signals, this->get_prev_error_signals(1), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); for (int i = 1; i < this->layer().get_num_children(); ++i) { const auto &prev_error = this->get_prev_error_signals(i); dc::tensor::Transform(error_signals, prev_error, accumulate_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } return; diff --git a/src/layers/transform/sum.cu b/src/layers/transform/sum.cu index 4aaa819dd76..de5158a79a7 100644 --- a/src/layers/transform/sum.cu +++ b/src/layers/transform/sum.cu @@ -67,11 +67,11 @@ void sum_distconv_adapter::fp_compute() { auto &activations = this->get_activations(); switch 
(this->layer().get_num_parents()) { case 0: - activations.zero(El::GPUManager::Stream()); + activations.zero(hydrogen::cuda::GetDefaultStream()); break; case 1: dc::tensor::Copy(activations, this->get_prev_activations(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; case 2: // Optimization for layers with 2 parents (e.g., @@ -82,7 +82,7 @@ void sum_distconv_adapter::fp_compute() { this->get_prev_activations(0), this->get_prev_activations(1), sum_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; default: for (int i = 0; i < this->layer().get_num_parents(); ++i) { @@ -90,11 +90,11 @@ void sum_distconv_adapter::fp_compute() { prev_activations.set_outermost_dimension(activations.get_shape()[-1]); if (i == 0) { dc::tensor::Copy(activations, prev_activations, - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } else { distconv::tensor::Transform(activations, prev_activations, accumulate_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } } diff --git a/src/layers/transform/tessellate.cu b/src/layers/transform/tessellate.cu index 771a225c3d9..c74d02dc4a5 100644 --- a/src/layers/transform/tessellate.cu +++ b/src/layers/transform/tessellate.cu @@ -130,7 +130,7 @@ void tessellate_layer const auto& local_width = local_output.Width(); const auto& block_size = 256; const auto& grid_size = (local_height * local_width + block_size - 1) / block_size; - fp_gpu_3d_kernel<<>>( + fp_gpu_3d_kernel<<>>( input_dims[0], input_dims[1], input_dims[2], output_dims[0], output_dims[1], output_dims[2], local_height, local_width, @@ -153,7 +153,7 @@ void tessellate_layer const auto& local_width = local_gradient_wrt_output.Width(); const auto& block_size = 256; const auto& grid_size = (local_height * local_width + block_size - 1) / block_size; - bp_gpu_3d_kernel<<>>( + bp_gpu_3d_kernel<<>>( input_dims[0], input_dims[1], input_dims[2], output_dims[0], output_dims[1], output_dims[2], local_height, local_width, diff --git a/src/models/model.cpp b/src/models/model.cpp index c3ea344fe6e..a61bfd13525 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -1330,7 +1330,7 @@ bool model::load_from_checkpoint_shared(persist& p) { // } p.set_restart_dir(trainer_dir); #ifdef LBANN_HAS_GPU - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); #endif // LBANN_HAS_GPU return true; } diff --git a/src/objective_functions/weight_regularization/l2.cpp b/src/objective_functions/weight_regularization/l2.cpp index 36c94602e34..c73981c3627 100644 --- a/src/objective_functions/weight_regularization/l2.cpp +++ b/src/objective_functions/weight_regularization/l2.cpp @@ -119,7 +119,7 @@ void l2_weight_regularization::start_evaluation() { #ifdef LBANN_HAS_GPU // Compute contributions from GPU weights if (m_contributions.count(El::Device::GPU) > 0) { - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); DMatType contribution; #ifdef HYDROGEN_HAVE_CUB contribution.SetMemoryMode(1); // CUB GPU memory pool diff --git a/src/objective_functions/weight_regularization/l2.cu b/src/objective_functions/weight_regularization/l2.cu index 7a823a9d8b6..dabe3c1c730 100644 --- a/src/objective_functions/weight_regularization/l2.cu +++ b/src/objective_functions/weight_regularization/l2.cu @@ -81,8 +81,8 @@ void l2_weight_regularization::accumulate_contribution(const El const auto& size = vals.Height() * vals.Width(); const El::Int block_size = 256; const auto& grid_size = (size + block_size - 
1) / block_size; - auto&& stream = El::GPUManager::Stream(); - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); accumulate_contribution_kernel <<>>( vals.Height(), vals.Width(), diff --git a/src/optimizers/adagrad.cu b/src/optimizers/adagrad.cu index 9325c5efdb3..22c5b4c6dec 100644 --- a/src/optimizers/adagrad.cu +++ b/src/optimizers/adagrad.cu @@ -65,7 +65,7 @@ void adagrad::step_compute_gpu(AbsDistMatrixType& values, if (local_size > 0) { constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); adagrad_kernel<<>>( local_height, local_width, this->get_learning_rate(), m_eps, diff --git a/src/optimizers/adam.cu b/src/optimizers/adam.cu index 6901a990ead..ac12ebf38c1 100644 --- a/src/optimizers/adam.cu +++ b/src/optimizers/adam.cu @@ -96,7 +96,7 @@ void adam::step_compute_gpu(AbsDistMatrixType& values, // Launch CUDA kernel constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); if (values.Contiguous() && gradient.Contiguous() && m_moment1->Contiguous() && m_moment2->Contiguous()) { adam_contiguous_kernel<<>>( diff --git a/src/optimizers/rmsprop.cu b/src/optimizers/rmsprop.cu index e3820a4d22f..4f67dec1ff4 100644 --- a/src/optimizers/rmsprop.cu +++ b/src/optimizers/rmsprop.cu @@ -67,7 +67,7 @@ void rmsprop::step_compute_gpu(AbsDistMatrixType& values, if (local_size > 0) { constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); rmsprop_kernel<<>>( local_height, local_width, this->get_learning_rate(), m_decay_rate, m_eps, diff --git a/src/optimizers/sgd.cu b/src/optimizers/sgd.cu index b33e54ee5d2..a65c51370cb 100644 --- a/src/optimizers/sgd.cu +++ b/src/optimizers/sgd.cu @@ -109,7 +109,7 @@ void sgd::momentum_step_gpu(AbsDistMatrixType& values, // Launch CUDA kernels for momentum SGD or NAG constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); if (m_nesterov) { nesterov_kernel<<>>( local_height, local_width, diff --git a/src/utils/cudnn.cpp b/src/utils/cudnn.cpp index 260fa2f002f..18724b6bc24 100644 --- a/src/utils/cudnn.cpp +++ b/src/utils/cudnn.cpp @@ -48,10 +48,10 @@ namespace { struct handle_wrapper { cudnnHandle_t handle; handle_wrapper() : handle(nullptr) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); if (handle == nullptr) { CHECK_CUDNN(cudnnCreate(&handle)); } if (handle == nullptr) { LBANN_ERROR("failed to create cuDNN handle"); } - CHECK_CUDNN(cudnnSetStream(handle, El::GPUManager::Stream())); + CHECK_CUDNN(cudnnSetStream(handle, hydrogen::cuda::GetDefaultStream())); } handle_wrapper(const handle_wrapper&) = delete; handle_wrapper& operator=(const handle_wrapper&) = delete; @@ -75,9 +75,9 @@ void destroy() { cudnnHandle_t& get_handle() { if (!handle_instance) { initialize(); } - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); 
CHECK_CUDNN(cudnnSetStream(handle_instance->handle, - El::GPUManager::Stream())); + hydrogen::cuda::GetDefaultStream())); return handle_instance->handle; } diff --git a/src/utils/distconv.cpp b/src/utils/distconv.cpp index f20923726fb..34fcf3a6e8e 100644 --- a/src/utils/distconv.cpp +++ b/src/utils/distconv.cpp @@ -289,12 +289,12 @@ void initialize(MPI_Comm comm) { p2p_instance = new p2p::P2P(mpi_comm); #endif // DISTCONV_HAS_P2P mpicuda_comm_instance = new Al::mpicuda_backend::comm_type( - mpi_comm, El::GPUManager::Stream()); + mpi_comm, hydrogen::cuda::GetDefaultStream()); ::distconv::cudnn::Options backend_opts; backend_opts.m_deterministic = opt_deterministic; backend_instance = new Backend( mpi_comm, lbann::cudnn::get_handle(), - El::GPUManager::Stream(), backend_opts); + hydrogen::cuda::GetDefaultStream(), backend_opts); print_options(std::cout); initialized = true; } diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index 53a0895c312..3c42d6dd461 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -392,7 +392,7 @@ void print_lbann_configuration(lbann_comm *comm, int io_threads_per_process, int << " I/O threads per process (+offset) : " << io_threads_per_process << " (+" << io_threads_offset << ")" << std::endl; #ifdef HYDROGEN_HAVE_CUDA - std::cout << " GPUs on node : " << El::GPUManager::NumDevices() << std::endl; + std::cout << " GPUs on node : " << hydrogen::gpu::DeviceCount() << std::endl; #endif // HYDROGEN_HAVE_CUDA std::cout << std::endl; diff --git a/src/utils/profiling.cpp b/src/utils/profiling.cpp index cc237411778..e0f26b2c137 100644 --- a/src/utils/profiling.cpp +++ b/src/utils/profiling.cpp @@ -73,7 +73,7 @@ void prof_stop() { void prof_region_begin(const char *s, int c, bool sync) { if (!profiling_started) return; if (sync) { - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); } // Doesn't work with gcc 4.9 // nvtxEventAttributes_t ev = {0}; @@ -90,7 +90,7 @@ void prof_region_begin(const char *s, int c, bool sync) { void prof_region_end(const char *, bool sync) { if (!profiling_started) return; if (sync) { - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); } nvtxRangePop(); } diff --git a/src/weights/initializer.cpp b/src/weights/initializer.cpp index 8e3bf294dd3..a941909e01a 100644 --- a/src/weights/initializer.cpp +++ b/src/weights/initializer.cpp @@ -95,7 +95,7 @@ void value_initializer::fill(AbsDistMatrixType& matrix) { if (matrix.GetLocalDevice() != El::Device::CPU) { El::Copy(matrix_cpu, matrix.Matrix()); #ifdef HYDROGEN_HAVE_CUDA - El::GPUManager::SynchronizeStream(); /// @todo Use new Hydrogen synchronization semantics when available + Synchronize(hydrogen::gpu::DefaultSyncInfo()); /// @todo Use new Hydrogen synchronization semantics when available #endif // HYDROGEN_HAVE_CUDA } From 831e57b306b96d19e0951b9be58fcd8ac66fcd11 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Fri, 31 Jul 2020 09:36:39 -0700 Subject: [PATCH 02/36] Updated the thread topology code to be compliant with Hydrogen 1.4 API (#1591) --- src/utils/threads/thread_topology.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/threads/thread_topology.cpp b/src/utils/threads/thread_topology.cpp index a2c52e1228d..362d6151812 100644 --- a/src/utils/threads/thread_topology.cpp +++ b/src/utils/threads/thread_topology.cpp @@ -156,7 +156,7 @@ hwloc_cpuset_t get_local_cpuset_for_current_thread(hwloc_topology_t topo) { hwloc_cpuset_t local_cpuset = hwloc_bitmap_alloc(); #ifdef 
LBANN_HAS_GPU // Find CPUs close to the GPU being used - hwloc_cudart_get_device_cpuset(topo, hydrogen::GPUManager::Device(), local_cpuset); + hwloc_cudart_get_device_cpuset(topo, hydrogen::gpu::DefaultDevice(), local_cpuset); #else hwloc_const_cpuset_t allowed_cpuset = hwloc_topology_get_allowed_cpuset(topo); local_cpuset = hwloc_bitmap_dup(allowed_cpuset); From b08fe0cd5b6e2f3dd70841afe1d41f63e8a035cf Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Fri, 31 Jul 2020 17:33:10 -0700 Subject: [PATCH 03/36] Update spack to new versions (#1593) * Updated the versions of CMake and CUDA used for building LBANN. Also included externals for IvyBridge CPUs. * Updated the versions of Aluminum and Hydrogen to V0.4 and v1.4, respectively. Also temporarily disabled DiHydrogen. * Updated version of OpenBLAS and Conduit * Removed flag to build docs for DiHydrogen. * Updated CUDA version on Power to 10.2.89 * Apply suggestions from code review Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> --- scripts/install_lbann.sh | 9 ++- .../externals-linux-rhel7-broadwell.sh | 14 ++-- .../llnl_lc/externals-linux-rhel7-haswell.sh | 12 +-- .../externals-linux-rhel7-ivybridge.sh | 75 +++++++++++++++++++ .../llnl_lc/externals-linux-rhel7-power8le.sh | 10 +-- .../llnl_lc/externals-linux-rhel7-power9le.sh | 14 ++-- .../externals-cray-cnl7-skylake_avx512.sh | 2 +- .../std_versions_and_variants.sh | 4 +- 8 files changed, 108 insertions(+), 32 deletions(-) create mode 100644 spack_environments/llnl_lc/externals-linux-rhel7-ivybridge.sh diff --git a/scripts/install_lbann.sh b/scripts/install_lbann.sh index e82b3587130..c0c669c18dc 100755 --- a/scripts/install_lbann.sh +++ b/scripts/install_lbann.sh @@ -194,7 +194,7 @@ EOF SUPERBUILD_SPECS=$(cat < Date: Fri, 31 Jul 2020 17:34:41 -0700 Subject: [PATCH 04/36] Update superbuild to use Hydrogen 1.4.0 and Aluminum 0.4.0 (#1594) --- scripts/build_lbann_lc.sh | 3 +-- superbuild/aluminum/CMakeLists.txt | 2 +- superbuild/hydrogen/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh index 8745243528d..a41b00e2975 100755 --- a/scripts/build_lbann_lc.sh +++ b/scripts/build_lbann_lc.sh @@ -330,7 +330,7 @@ fi # Load packages if [ ${USE_MODULES} -ne 0 ]; then module load git - module load cmake/3.14.5 + module load cmake/3.16.8 else use git fi @@ -805,7 +805,6 @@ cmake \ -D LBANN_SB_BUILD_PROTOBUF=ON \ -D LBANN_SB_BUILD_CUB=${WITH_CUB} \ -D LBANN_SB_BUILD_ALUMINUM=${WITH_ALUMINUM} \ --D ALUMINUM_TAG=v0.3.3 \ -D ALUMINUM_ENABLE_MPI_CUDA=${ALUMINUM_WITH_MPI_CUDA} \ -D ALUMINUM_ENABLE_NCCL=${ALUMINUM_WITH_NCCL} \ -D LBANN_SB_BUILD_CONDUIT=${WITH_CONDUIT} \ diff --git a/superbuild/aluminum/CMakeLists.txt b/superbuild/aluminum/CMakeLists.txt index 75ac7faf73a..b97e8043057 100644 --- a/superbuild/aluminum/CMakeLists.txt +++ b/superbuild/aluminum/CMakeLists.txt @@ -11,7 +11,7 @@ else () CACHE STRING "The URL from which to clone Aluminum") endif () -set(ALUMINUM_TAG "v0.3.3" +set(ALUMINUM_TAG "v0.4.0" CACHE STRING "The git tag to checkout for Aluminum") set(ALUMINUM_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index bda354252b9..15107a2890a 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -109,7 +109,7 @@ else () endif () # ... then the tag. 
-set(HYDROGEN_TAG "v1.3.4" +set(HYDROGEN_TAG "v1.4.0" CACHE STRING "The git tag or hash to checkout for Hydrogen") if (HYDROGEN_CUSTOM_SOURCE_DIR) From ddb88cc139203f6156aa728b94e3611574fe794b Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Tue, 4 Aug 2020 17:06:07 -0700 Subject: [PATCH 05/36] Update spack build (#1595) * Updated to use new Aluminum variant and reenabled DiHydrogen. * Reverted CUDA back to 10.1.168 and bumped cuDNN to 8.0.2 --- scripts/install_lbann.sh | 4 ++-- .../llnl_lc/externals-linux-rhel7-broadwell.sh | 6 +++--- .../llnl_lc/externals-linux-rhel7-power9le.sh | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/install_lbann.sh b/scripts/install_lbann.sh index c0c669c18dc..c49460aeaef 100755 --- a/scripts/install_lbann.sh +++ b/scripts/install_lbann.sh @@ -194,7 +194,7 @@ EOF SUPERBUILD_SPECS=$(cat < Date: Wed, 5 Aug 2020 10:37:20 -0700 Subject: [PATCH 06/36] Add support for distributed embedding layer with double datatype (#1590) --- src/layers/misc/dist_embedding.cpp | 71 ++++++++++-------------------- src/layers/misc/dist_embedding.cu | 2 + 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/src/layers/misc/dist_embedding.cpp b/src/layers/misc/dist_embedding.cpp index 735ca420f8e..f8f135bf9a9 100644 --- a/src/layers/misc/dist_embedding.cpp +++ b/src/layers/misc/dist_embedding.cpp @@ -366,56 +366,27 @@ struct Builder } }; -template <> -struct Builder -{ - template - static std::unique_ptr Build(Args&&... args) - { - using TensorDataType = float; - constexpr data_layout Layout = data_layout::DATA_PARALLEL; - constexpr El::Device Device = El::Device::CPU; +#define DEFINE_BUILDER(TensorDataType, Device) \ +template <> \ +struct Builder \ +{ \ + template \ + static std::unique_ptr Build(Args&&... args) \ + { \ + constexpr data_layout Layout = data_layout::DATA_PARALLEL; \ + using LayerType = dist_embedding_layer; \ + return make_unique(std::forward(args)...); \ + } \ +} #ifdef LBANN_HAS_SHMEM - using LayerType = dist_embedding_layer; - return make_unique(std::forward(args)...); -#else - LBANN_ERROR( - "Attempted to construct CPU dist_embedding_layer, ", - "but LBANN has not been built with OpenSHMEM support " - "(TensorDataType=",TypeName(),", ", - "Layout=",to_string(Layout),", ", - "Device=",to_string(Device),")"); - return nullptr; +DEFINE_BUILDER(float, El::Device::CPU); +DEFINE_BUILDER(double, El::Device::CPU); #endif // LBANN_HAS_SHMEM - } -}; - -#ifdef LBANN_HAS_GPU -template <> -struct Builder -{ - template - static std::unique_ptr Build(Args&&... 
args) - { - using TensorDataType = float; - constexpr data_layout Layout = data_layout::DATA_PARALLEL; - constexpr El::Device Device = El::Device::GPU; -#ifdef LBANN_HAS_NVSHMEM - using LayerType = dist_embedding_layer; - return make_unique(std::forward(args)...); -#else - LBANN_ERROR( - "Attempted to construct GPU dist_embedding_layer, ", - "but LBANN has not been built with NVSHMEM support " - "(TensorDataType=",TypeName(),", ", - "Layout=",to_string(Layout),", ", - "Device=",to_string(Device),")"); - return nullptr; -#endif // LBANN_HAS_NVSHMEM - } - -}; -#endif // LBANN_HAS_GPU +#if defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) +DEFINE_BUILDER(float, El::Device::GPU); +DEFINE_BUILDER(double, El::Device::GPU); +#endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) +#undef DEFINE_BUILDER } // namespace @@ -444,10 +415,14 @@ std::unique_ptr build_dist_embedding_layer_from_pbuf( #ifdef LBANN_HAS_SHMEM template class dist_embedding_layer< float, data_layout::DATA_PARALLEL, El::Device::CPU>; +template class dist_embedding_layer< + double, data_layout::DATA_PARALLEL, El::Device::CPU>; #endif // LBANN_HAS_SHMEM #if defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) extern template class dist_embedding_layer< float, data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class dist_embedding_layer< + double, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) #define PROTO_DEVICE(T, Device) \ diff --git a/src/layers/misc/dist_embedding.cu b/src/layers/misc/dist_embedding.cu index d2a7ee2e895..b6b8901f9f8 100644 --- a/src/layers/misc/dist_embedding.cu +++ b/src/layers/misc/dist_embedding.cu @@ -696,6 +696,8 @@ void dist_embedding_layer::apply_sparse_sgd_step( /// @todo fp16 template class dist_embedding_layer< float, data_layout::DATA_PARALLEL, El::Device::GPU>; +template class dist_embedding_layer< + double, data_layout::DATA_PARALLEL, El::Device::GPU>; } // namespace lbann #endif // LBANN_HAS_NVSHMEM From 9502b2d6a050d4d9834ac638e87a639ee233a2c1 Mon Sep 17 00:00:00 2001 From: Shehtab Zaman Date: Fri, 7 Aug 2020 18:23:54 -0400 Subject: [PATCH 07/36] Graph Convolution Layers in Python Front End Modules (#1592) * Added GCN implementation - Added download script for MNIST_Superpixel - Allows for pre-processing upon download - Added python reader for training data - train.py contains single layer GCN * Updating train.py to reinsert GCN layer * Added new graph kernels and new dataset - Added new graph kernel implementations: GCN, GIN, Graph - Created new method for handling slicing in data * Completed PROTEINS dataset - Updated Graph_Kernels, main, and train py - Graph_Kernels are currently working in both dense and sliced versions - PROTEINS data can be extracted and used with current format * Updated weights init with std = 1/output_channels to correct exploding loss * Added Sparse_Train model * Added graph data.py * Refactoreed to mimic proper modules structure * Updated with proper local importing. Deleting files and moving over to new file structure * Added training files for GCN, GIN, and Graph convolutions * Added documentation for the GCN layer * Renamed directory * Committing changes for rebase * Modified to use data_store in local cache mode. Modified for "poor man's LTFB" * bug fix in copy_members(), setting something to a null ptr. Only showed up when using a carved validation set. * push test * adding m_verbose option. * tweaks and bug fixes. 
* Allow dump weights to be used with LTFB (#1568) * Updated the dump weights callback to properly handly output from multiple models and trainers so that they don't clobber each other. * Fixed the name in the proto-interface. * Update convert_npz_to_conduit.cpp Removed RNG seed from the initialize function. * Added GCN implementation - Added download script for MNIST_Superpixel - Allows for pre-processing upon download - Added python reader for training data - train.py contains single layer GCN * Updating train.py to reinsert GCN layer * Added new graph kernels and new dataset - Added new graph kernel implementations: GCN, GIN, Graph - Created new method for handling slicing in data * Completed PROTEINS dataset - Updated Graph_Kernels, main, and train py - Graph_Kernels are currently working in both dense and sliced versions - PROTEINS data can be extracted and used with current format * Updated weights init with std = 1/output_channels to correct exploding loss * Added Sparse_Train model * Added graph data.py * Refactoreed to mimic proper modules structure * Updated with proper local importing. Deleting files and moving over to new file structure * Added training files for GCN, GIN, and Graph convolutions * Added documentation for the GCN layer * Renamed directory * Committing changes for rebase * Fixed merge issues * Fixed accidental changes of Smiles Data Reader * Fixed accidental changes of Smiles Data Reader * Adding Graph Layers in python/lbann/modules - Added GAT,GCN,Graph, and GIN layers - Added a utility class for handling graph vertex data * GINConv tested and complete * Added Grated Graph Convolution Layer * Added Sparse_Graph_Trainer to simplify model generation. All graph layers currently generating models * Working versions for GCN, GIN, Graph, and GatedGraph convolutions kernels. - All 4 'sparse' graph layers generating models and learning (decreaseing, non-hanged objective functions) * Added testing files for sparse graph layers * Update python/lbann/modules/graph/sparse/GINConv.py Updating documentation to properly use modules. Co-authored-by: Tim Moon * - Renamed GATConb.py to GatedGraphConv.py - Updated documentation on GCNConv to highlight that the adjacency matrix is assumed to be normalized (or symmetrically normalized) - Updated utils.py to concatenate slices in on step * Added updated tests for Dense Graph Kernels - Updated GINConv and GatedGraphConv to use GraphVertexData method to update shape - Fixed typo in DenseGraph Conv - Fixed extranous line in util.py Co-authored-by: David A. 
Hysom Co-authored-by: davidHysom Co-authored-by: Brian Van Essen Co-authored-by: Tim Moon --- applications/MOF/MOFae.py | 5 +- applications/MOF/README.md | 5 +- applications/graph/GNN/Dense_Graph_Trainer.py | 205 +++++++++++++++ applications/graph/GNN/README.md | 48 ++++ .../graph/GNN/Sparse_Graph_Trainer.py | 245 ++++++++++++++++++ .../GNN/data/MNIST_Superpixel/__init__.py | 24 ++ .../data/MNIST_Superpixel/update_adj_mat.py | 10 + .../graph/GNN/data/MNIST_Superpixel/utils.py | 100 +++++++ .../GNN/data/PROTEINS/PROTEINS_Dataset.py | 76 ++++++ .../graph/GNN/data/PROTEINS/__init__.py | 24 ++ applications/graph/GNN/data/PROTEINS/utils.py | 126 +++++++++ applications/graph/GNN/data/__init__.py | 0 applications/graph/GNN/main.py | 91 +++++++ applications/graph/GNN/test/__init__.py | 0 applications/graph/GNN/test/conftest.py | 41 +++ .../graph/GNN/test/test_integration_DGCN.py | 153 +++++++++++ .../graph/GNN/test/test_integration_DGraph.py | 153 +++++++++++ .../graph/GNN/test/test_integration_GCN.py | 153 +++++++++++ .../graph/GNN/test/test_integration_GIN.py | 153 +++++++++++ .../GNN/test/test_integration_GatedGraph.py | 153 +++++++++++ .../graph/GNN/test/test_integration_Graph.py | 153 +++++++++++ python/lbann/modules/__init__.py | 1 + python/lbann/modules/graph/__init__.py | 12 + .../lbann/modules/graph/dense/DenseGCNConv.py | 27 ++ .../modules/graph/dense/DenseGraphConv.py | 34 +++ python/lbann/modules/graph/dense/__init__.py | 7 + python/lbann/modules/graph/sparse/GCNConv.py | 120 +++++++++ python/lbann/modules/graph/sparse/GINConv.py | 76 ++++++ .../modules/graph/sparse/GatedGraphConv.py | 98 +++++++ .../lbann/modules/graph/sparse/GraphConv.py | 132 ++++++++++ python/lbann/modules/graph/sparse/__init__.py | 11 + python/lbann/modules/graph/utils.py | 103 ++++++++ 32 files changed, 2535 insertions(+), 4 deletions(-) create mode 100644 applications/graph/GNN/Dense_Graph_Trainer.py create mode 100644 applications/graph/GNN/README.md create mode 100644 applications/graph/GNN/Sparse_Graph_Trainer.py create mode 100644 applications/graph/GNN/data/MNIST_Superpixel/__init__.py create mode 100644 applications/graph/GNN/data/MNIST_Superpixel/update_adj_mat.py create mode 100644 applications/graph/GNN/data/MNIST_Superpixel/utils.py create mode 100644 applications/graph/GNN/data/PROTEINS/PROTEINS_Dataset.py create mode 100644 applications/graph/GNN/data/PROTEINS/__init__.py create mode 100644 applications/graph/GNN/data/PROTEINS/utils.py create mode 100644 applications/graph/GNN/data/__init__.py create mode 100644 applications/graph/GNN/main.py create mode 100644 applications/graph/GNN/test/__init__.py create mode 100644 applications/graph/GNN/test/conftest.py create mode 100644 applications/graph/GNN/test/test_integration_DGCN.py create mode 100644 applications/graph/GNN/test/test_integration_DGraph.py create mode 100644 applications/graph/GNN/test/test_integration_GCN.py create mode 100644 applications/graph/GNN/test/test_integration_GIN.py create mode 100644 applications/graph/GNN/test/test_integration_GatedGraph.py create mode 100644 applications/graph/GNN/test/test_integration_Graph.py create mode 100644 python/lbann/modules/graph/__init__.py create mode 100644 python/lbann/modules/graph/dense/DenseGCNConv.py create mode 100644 python/lbann/modules/graph/dense/DenseGraphConv.py create mode 100644 python/lbann/modules/graph/dense/__init__.py create mode 100644 python/lbann/modules/graph/sparse/GCNConv.py create mode 100644 python/lbann/modules/graph/sparse/GINConv.py create mode 100644 
python/lbann/modules/graph/sparse/GatedGraphConv.py create mode 100644 python/lbann/modules/graph/sparse/GraphConv.py create mode 100644 python/lbann/modules/graph/sparse/__init__.py create mode 100644 python/lbann/modules/graph/utils.py diff --git a/applications/MOF/MOFae.py b/applications/MOF/MOFae.py index bfef507aaad..90d235ff11b 100644 --- a/applications/MOF/MOFae.py +++ b/applications/MOF/MOFae.py @@ -7,10 +7,11 @@ # ---------------------------------- def gen_layers(latent_dim, number_of_atoms): ''' Generates the model for the 3D Convolutional Auto Encoder. - - returns the Directed Acyclic Graph (DAG) that the lbann + + returns the Directed Acyclic Graph (DAG) that the lbann model will run on. ''' + input_ = lbann.Input( target_mode = "reconstruction") tensors = lbann.Identity(input_) diff --git a/applications/MOF/README.md b/applications/MOF/README.md index d5444889519..add112bc6b1 100644 --- a/applications/MOF/README.md +++ b/applications/MOF/README.md @@ -35,7 +35,6 @@ python3 -m pytest For more information on the data representation: - @article {Kimeaax9324, author = {Kim, Baekjun and Lee, Sangwon and Kim, Jihan}, title = {Inverse design of porous materials using artificial neural networks}, @@ -49,4 +48,6 @@ For more information on the data representation: eprint = {https://advances.sciencemag.org/content/6/1/eaax9324.full.pdf}, journal = {Science Advances} } - + +The model is based on work supported by the National Science Foundation under Grant No. DMR-1940243. +Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation. diff --git a/applications/graph/GNN/Dense_Graph_Trainer.py b/applications/graph/GNN/Dense_Graph_Trainer.py new file mode 100644 index 00000000000..42a2e0fb473 --- /dev/null +++ b/applications/graph/GNN/Dense_Graph_Trainer.py @@ -0,0 +1,205 @@ +import lbann +from lbann.util import str_list +from lbann.modules.graph import DenseGCNConv, DenseGraphConv + + +def DGCN_layer(feature_matrix,adj_matrix, node_features): + """An example 3-layer GCN kernel. + Args: + feature_matrix (Layer): Node feature layer. Should have the shape: + (num_nodes, node_features) + adj_matrix (Layer): Adjacency matrix layer. Should have the shape: + (num_nodes, num_nodes) + node_features (int): The number of features per node + Returns: + (Layer): Returns the new embedding of the node features + """ + out_channel_1 = 1024 + out_channel_2 = 512 + out_channel_3 = 256 + + gcn1 = DenseGCNConv(input_channels = node_features, output_channels = out_channel_1) + gcn2 = DenseGCNConv(input_channels = out_channel_1, output_channels = out_channel_2) + gcn3 = DenseGCNConv(input_channels = out_channel_2, output_channels = out_channel_3) + + out_channel = out_channel_3 + + x = gcn1(feature_matrix, adj_matrix ) + x = lbann.Relu(x,name="DGCN1_activation") + + x = gcn2(x, adj_matrix) + x = lbann.Relu(x, name="DGCN2_activation") + + x = gcn3 (x, adj_matrix) + x = lbann.Relu(x, name="DGCN3_activation") + return x + + +def DGraph_Layer(feature_matrix,adj_matrix, node_features): + """An example 3-layer Graph kernel. + Args: + feature_matrix (Layer): Node feature layer. Should have the shape: + (num_nodes, node_features) + adj_matrix (Layer): Adjacency matrix layer. Should have the shape: + (num_nodes, num_nodes) + node_features (int): The number of features per node + Returns: + (Layer): Returns the new embedding of the node features + """ + out_channel_1 = 1024 + out_channel_2 = 512 + out_channel_3 = 256 + + gcn1 = DenseGraphConv(input_channels = node_features, output_channels = out_channel_1) + gcn2 = DenseGraphConv(input_channels = out_channel_1, output_channels = out_channel_2) + gcn3 = DenseGraphConv(input_channels = out_channel_2, output_channels = out_channel_3) + + out_channel = out_channel_3 + + x = gcn1(feature_matrix, adj_matrix ) + x = lbann.Relu(x,name="DGraph1_activation") + + x = gcn2(x, adj_matrix) + x = lbann.Relu(x, name="DGraph2_activation") + + x = gcn3 (x, adj_matrix) + x = lbann.Relu(x, name="DGraph3_activation") + return x + + +def make_model(num_vertices = None, + node_features = None, + num_classes = None, + dataset = None, + kernel_type = 'GCN', + callbacks = None, + num_epochs = 1): + '''Construct a model DAG using one of the Graph Kernels + + Args: + num_vertices (int): Number of vertices of each graph (default: None) + node_features (int): Number of features per node (default: None) + num_classes (int): Number of classes as targets (default: None) + dataset (str): Preset data set to use. Either a dataset parameter has to be + supplied or all of num_vertices, node_features, and + num_classes have to be supplied. (default: None) + kernel_type (str): Graph Kernel to use in model. Expected one of + GCN, or Graph (default: GCN) + callbacks (list): Callbacks for the model. If set to None the model description, + GPU usage, training_output, and timer are reported. + (default: None) + num_epochs (int): Number of epochs to run (default: 1) + Returns: + (lbann.Model): A model object with the supplied callbacks, dataset + presets, and graph kernels.
+ ''' + + assert num_vertices != dataset #Ensure at least one of the values is set + + if dataset is not None: + assert num_vertices is None + + if dataset == 'MNIST': + num_vertices = 75 + num_classes = 10 + node_features = 1 + + elif dataset == 'PROTEINS': + num_vertices = 100 + num_classes = 2 + node_features = 3 + else: + raise Exception("Unknown Dataset") + + assert num_vertices is not None + assert num_classes is not None + assert node_features is not None + + + #---------------------------------- + # Reshape and Slice Input Tensor + #---------------------------------- + + input_ = lbann.Input(target_mode = 'classification') + + # Input dimensions should be (num_vertices * node_features + num_vertices^2 + num_classes ) + # input should have at least two children since the target is classification + + sample_dims = num_vertices*node_features + (num_vertices ** 2) + num_classes + graph_dims = num_vertices*node_features + (num_vertices ** 2) + feature_matrix_size = num_vertices * node_features + + graph_input = lbann.Slice(input_, axis = 0 , + slice_points = str_list([0,feature_matrix_size,graph_dims, sample_dims]), + name = "Graph_Input") + + + feature_matrix = lbann.Reshape(graph_input, + dims = str_list([num_vertices, node_features]), + name="Node_features") + + adj_matrix = lbann.Reshape(graph_input, + dims = str_list([num_vertices,num_vertices]), + name="Adj_Mat") + + target = lbann.Identity(graph_input, name="Target") + target = lbann.Reshape(target, dims=str(num_classes)) + + #---------------------------------- + # Perform Graph Convolution + #---------------------------------- + + if kernel_type == 'GCN': + x = DGCN_layer(feature_matrix, adj_matrix, node_features) + elif kernel_type == 'Graph': + x = DGraph_Layer(feature_matrix, adj_matrix, node_features) + else: + raise ValueError('Invalid Graph kernel specifier "{}" received. Expected one of:\ + GCN or Graph'.format(kernel_type)) + out_channel = 256 + #---------------------------------- + # Apply Reduction on Node Features + #---------------------------------- + + average_vector = lbann.Constant(value = 1/num_vertices, num_neurons = str_list([1,num_vertices]), name="Average_Vector") + x = lbann.MatMul(average_vector,x, name="Node_Feature_Reduction") # X is now a vector with output_channel dimensions + + x = lbann.Reshape(x, dims= str_list([out_channel]), name="Squeeze") + x = lbann.FullyConnected(x, num_neurons=256, name="hidden_layer_1") + x = lbann.Relu(x, name="hidden_layer_1_activation") + x = lbann.FullyConnected(x, num_neurons=num_classes, name="Output_Fully_Connected") + + #---------------------------------- + # Loss Function and Accuracy + #---------------------------------- + + + probs = lbann.Softmax(x, name="Softmax") + loss = lbann.CrossEntropy(probs, target, name="Cross_Entropy_Loss") + accuracy = lbann.CategoricalAccuracy(probs, target, name="Accuracy") + + layers = lbann.traverse_layer_graph(input_) + if callbacks is None: + print_model = lbann.CallbackPrintModelDescription() #Prints initial Model after Setup + training_output = lbann.CallbackPrint( interval = 1, + print_global_stat_only = False) #Prints training progress + gpu_usage = lbann.CallbackGPUMemoryUsage() + timer = lbann.CallbackTimer() + callbacks = [print_model, training_output, gpu_usage, timer] + else: + if isinstance (callbacks, list): + callbacks = callbacks + metrics = [lbann.Metric(accuracy, name='accuracy', unit="%")] + + model = lbann.Model(num_epochs, + layers = layers, + objective_function = loss, + metrics = metrics, + callbacks = callbacks + ) + return model + + +if __name__ == '__main__': + model = make_model(dataset="MNIST") + model = make_model(dataset="MNIST", kernel_type = 'Graph')
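A note on the Node_Feature_Reduction step in the trainer above: left-multiplying the (num_vertices, out_channel) embedding matrix by a constant 1 x num_vertices row vector whose entries are 1/num_vertices is exactly a mean over the vertex dimension. A minimal NumPy sketch of the same arithmetic (sizes are illustrative, not part of the patch):

```
import numpy as np

# Sketch of the Node_Feature_Reduction step: left-multiplying by a
# constant (1/num_vertices) row vector averages the vertex embeddings.
num_vertices, out_channel = 100, 256
x = np.random.rand(num_vertices, out_channel)        # stand-in node embeddings

average_vector = np.full((1, num_vertices), 1.0 / num_vertices)
pooled = average_vector @ x                          # shape (1, out_channel)

assert np.allclose(pooled[0], x.mean(axis=0))        # identical to a mean over vertices
```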
diff --git a/applications/graph/GNN/README.md b/applications/graph/GNN/README.md new file mode 100644 index 00000000000..1705f0a5e76 --- /dev/null +++ b/applications/graph/GNN/README.md @@ -0,0 +1,48 @@ +## LBANN's Implementation of Graph Convolutional Kernels +This directory contains models which use graph convolution kernels. The graph sub-module in lbann.modules enables +geometric deep learning on LBANN. + +## Datasets +The datasets used to test the graph layers are: + +1. MNIST Superpixel +2. PROTEINS + +To automatically download the MNIST Superpixel dataset: + +``` +cd data/MNIST_Superpixel +python3 MNIST_Superpixel_Dataset.py +``` + +To add self loops and normalize the adjacency matrix, run: + +``` +python3 update_adj_mat.py +``` + +To automatically download the PROTEINS dataset: +``` +cd data/PROTEINS +python3 PROTEINS_Dataset.py +``` + +Note: Both datasets require a significant amount of preprocessing post download, so +the download and processing step should be run using the scheduler. + + +## Running Instructions +To run a model with a graph kernel and a dataset: + +``` +python3 main.py --dataset (Proteins/MNIST) --model (GCN/GIN/GRAPH/GATEDGRAPH) --mini-batch-size MB --num-epochs N + +``` + + +## Links + +- Li, Yujia, et al. "Gated graph sequence neural networks." arXiv preprint arXiv:1511.05493 (2015). +- Kipf, Thomas N., and Max Welling. "Semi-supervised classification with graph convolutional networks." arXiv preprint arXiv:1609.02907 (2016). +- Xu, Keyulu, et al. "How powerful are graph neural networks?." arXiv preprint arXiv:1810.00826 (2018). +- Morris, Christopher, et al. "Weisfeiler and leman go neural: Higher-order graph neural networks."
Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 33. 2019. diff --git a/applications/graph/GNN/Sparse_Graph_Trainer.py b/applications/graph/GNN/Sparse_Graph_Trainer.py new file mode 100644 index 00000000000..2107f1ebf2d --- /dev/null +++ b/applications/graph/GNN/Sparse_Graph_Trainer.py @@ -0,0 +1,245 @@ +import lbann +from lbann.util import str_list +from lbann.modules.graph import GINConv, GCNConv, GraphConv, GatedGraphConv +from lbann.modules.graph import GraphVertexData +from graph_data_util import lbann_Graph_Data + +def GINConvLayer(X,A): + """An example GIN kernel with a 4-layer deep sequential nn. + Args: + X (GraphVertexData): Contains all the node features of the graph + A (Layer): Adjacency matrix layer. Should have the shape: + (num_nodes, num_nodes) + Returns: + (GraphVertexData): Returns the new embedding of the node features + """ + FC = lbann.modules.FullyConnectedModule + sequential_nn = \ + [FC(128), + lbann.Relu, + FC(64), + lbann.Relu, + FC(32), + lbann.Relu, + FC(16), + lbann.Relu] + out_channel = 16 + + gin = GINConv(sequential_nn, output_channels = out_channel) + return gin(X,A) + + +def GCNConvLayer(X,A): + """An example 2-layer GCN kernel. + Args: + X (GraphVertexData): Contains all the node features of the graph + A (Layer): Adjacency matrix layer. Should have the shape: + (num_nodes, num_nodes) + Returns: + (GraphVertexData): Returns the new embedding of the node features + """ + input_channels_1 = X.shape[1] + out_channels_1 = 8 + input_channels_2 = out_channels_1 + out_channels_2 = 16 + + gcn_1 = GCNConv(input_channels_1,out_channels_1, + bias = True, + activation = lbann.Relu, + name = 'GCN_1', + data_layout = 'data_parallel') + gcn_2 = GCNConv(input_channels_2,out_channels_2, + bias = True, + activation = lbann.Relu, + name = 'GCN_2', + data_layout = 'data_parallel') + X = gcn_1(X,A) + return gcn_2(X,A) + + +def GraphConvLayer(X,A): + """An example 2-layer Graph kernel. + Args: + X (GraphVertexData): Contains all the node features of the graph + A (Layer): Adjacency matrix layer. Should have the shape: + (num_nodes, num_nodes) + Returns: + (GraphVertexData): Returns the new embedding of the node features + """ + input_channels_1 = X.shape[1] + out_channels_1 = 8 + input_channels_2 = out_channels_1 + out_channels_2 = 16 + + graph_1 = GraphConv(input_channels_1, out_channels_1, + bias = True, + activation = lbann.Relu, + name = 'Graph_kernel_1', + data_layout = 'data_parallel') + graph_2 = GraphConv(input_channels_2, out_channels_2, + bias = True, + activation = lbann.Relu, + name = 'Graph_Kernel_2', + data_layout = 'data_parallel') + + X = graph_1(X,A) + return graph_2(X,A) + +def GATConvLayer(X,A): + """An example single-layer GatedGraph kernel. + Args: + X (GraphVertexData): Contains all the node features of the graph + A (Layer): Adjacency matrix layer. Should have the shape: + (num_nodes, num_nodes) + Returns: + (GraphVertexData): Returns the new embedding of the node features + """ + + output_channels = 8 + num_layers = 3 + name = 'GatedGraph' + data_layout = 'data_parallel' + + graph_kernel = GatedGraphConv(output_channels, + num_layers = num_layers, + name = name, + data_layout = data_layout) + return graph_kernel(X,A) + +def make_model(num_vertices = None, + node_features = None, + num_classes = None, + dataset = None, + kernel_type = 'GCN', + callbacks = None, + num_epochs = 1): + '''Construct a model DAG using one of the Graph Kernels + + Args: + num_vertices (int): Number of vertices of each graph (default: None) + node_features (int): Number of features per node (default: None) + num_classes (int): Number of classes as targets (default: None) + dataset (str): Preset data set to use. Either a dataset parameter has to be + supplied or all of num_vertices, node_features, and + num_classes have to be supplied. (default: None) + kernel_type (str): Graph Kernel to use in model. Expected one of + GCN, GIN, Graph, or GatedGraph (default: GCN) + callbacks (list): Callbacks for the model. If set to None the model description, + GPU usage, training_output, and timer are reported. + (default: None) + num_epochs (int): Number of epochs to run (default: 1) + Returns: + (lbann.Model): A model object with the supplied callbacks, dataset + presets, and graph kernels. + ''' + + assert num_vertices != dataset #Ensure at least one of the values is set + + if dataset is not None: + assert num_vertices is None + + if dataset == 'MNIST': + num_vertices = 75 + num_classes = 10 + node_features = 1 + + elif dataset == 'PROTEINS': + num_vertices = 100 + num_classes = 2 + node_features = 3 + else: + raise Exception("Unknown Dataset") + + assert num_vertices is not None + assert num_classes is not None + assert node_features is not None + + #---------------------------------- + # Reshape and Slice Input Tensor + #---------------------------------- + + input_ = lbann.Input(target_mode = 'classification') + + # Input dimensions should be (num_vertices * node_features + num_vertices^2 + num_classes ) + # Input should have at least two children since the target is classification + + data = lbann_Graph_Data(input_,num_vertices, node_features,num_classes) + + feature_matrix = data.x + adj_matrix = data.adj + target = data.y + + #---------------------------------- + # Perform Graph Convolution + #---------------------------------- + + if kernel_type == 'GIN': + x = GINConvLayer(feature_matrix, adj_matrix) + elif kernel_type == 'GCN': + x = GCNConvLayer(feature_matrix, adj_matrix) + elif kernel_type == 'Graph': + x = GraphConvLayer(feature_matrix, adj_matrix) + elif kernel_type == 'GatedGraph': + x = GATConvLayer(feature_matrix, adj_matrix) + else: + raise ValueError('Invalid Graph kernel specifier "{}" received.
Expected one of:\ + GIN, GCN, Graph, or GatedGraph'.format(kernel_type)) + + out_channel = x.shape[1] + #---------------------------------- + # Apply Reduction on Node Features + #---------------------------------- + + average_vector = lbann.Constant(value = 1/num_vertices, + num_neurons = str_list([1,num_vertices]), + name="Average_Vector") + x = x.get_mat(out_channel) + + x = lbann.MatMul(average_vector,x, name="Node_Feature_Reduction") + + # X is now a vector with output_channel dimensions + + x = lbann.Reshape(x, dims = str_list([out_channel]), name = "Squeeze") + x = lbann.FullyConnected(x, num_neurons = 64, name = "hidden_layer_1") + x = lbann.Relu(x, name = "hidden_layer_1_activation") + x = lbann.FullyConnected(x, num_neurons = num_classes, + name="Output_Fully_Connected") + + #---------------------------------- + # Loss Function and Accuracy + #---------------------------------- + + + probs = lbann.Softmax(x, name="Softmax") + loss = lbann.CrossEntropy(probs, target, name="Cross_Entropy_Loss") + accuracy = lbann.CategoricalAccuracy(probs, target, name="Accuracy") + + layers = lbann.traverse_layer_graph(input_) + + if callbacks is None: + print_model = lbann.CallbackPrintModelDescription() #Prints initial Model after Setup + training_output = lbann.CallbackPrint( interval = 1, + print_global_stat_only = False) #Prints training progress + gpu_usage = lbann.CallbackGPUMemoryUsage() + timer = lbann.CallbackTimer() + callbacks = [print_model, training_output, gpu_usage, timer] + else: + if isinstance (callbacks, list): + callbacks = callbacks + + metrics = [lbann.Metric(accuracy, name='accuracy', unit="%")] + + model = lbann.Model(num_epochs, + layers = layers, + objective_function = loss, + metrics = metrics, + callbacks = callbacks + ) + return model + +if __name__ == '__main__': + # Quick check to see if model generates correctly + model_1 = make_model(dataset="MNIST", kernel_type = 'GIN') + model_1 = make_model(dataset="MNIST", kernel_type = 'GCN') + model_1 = make_model(dataset="MNIST", kernel_type = 'Graph') + model_1 = make_model(dataset="MNIST", kernel_type = 'GatedGraph') diff --git a/applications/graph/GNN/data/MNIST_Superpixel/__init__.py b/applications/graph/GNN/data/MNIST_Superpixel/__init__.py new file mode 100644 index 00000000000..99301eab715 --- /dev/null +++ b/applications/graph/GNN/data/MNIST_Superpixel/__init__.py @@ -0,0 +1,24 @@ +import urllib.request +import tarfile +import os +import os.path + +import lbann + +data_dir = os.path.dirname(os.path.realpath(__file__)) +def make_data_reader(): #TO DO: Extend this to use this for validation / test set as well after testing + + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = False #Turn off shuffle for debugging + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'MNIST_Superpixel_Dataset' + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = 'get_train' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + + return reader + diff --git a/applications/graph/GNN/data/MNIST_Superpixel/update_adj_mat.py b/applications/graph/GNN/data/MNIST_Superpixel/update_adj_mat.py new file mode 100644 index 00000000000..31f31123b15 --- /dev/null +++ b/applications/graph/GNN/data/MNIST_Superpixel/update_adj_mat.py @@ -0,0 +1,10 @@ +import numpy as np + +adj_mats = np.load('adj_matrices.npy') + +num_data = adj_mats.shape[0] +for adj in range(num_data): + print(adj, " / ", num_data) + deg_inv_sqrt = (adj_mats[adj].sum(axis=-1).clip(min=1)**(-0.5)).reshape(len(adj_mats[adj]),1) + adj_mats[adj] = deg_inv_sqrt * adj_mats[adj] * deg_inv_sqrt.T # D^-1/2 * A * D^-1/2 +np.save('adj_matrices.npy', adj_mats)
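For reference, the per-graph update in update_adj_mat.py above is the symmetric normalization used by GCN-style kernels, A <- D^-1/2 A D^-1/2 (note the transpose on the second degree factor, so that rows are scaled by 1/sqrt(d_i) and columns by 1/sqrt(d_j)). A minimal NumPy sketch on a toy 3-node graph, with made-up values:

```
import numpy as np

# Toy 3-node graph; illustrates the D^(-1/2) A D^(-1/2) scaling
# performed per graph in update_adj_mat.py above.
A = np.array([[0., 1., 1.],
              [1., 0., 0.],
              [1., 0., 0.]])

d_inv_sqrt = (A.sum(axis=-1).clip(min=1) ** -0.5).reshape(-1, 1)

# Entry (i,j) becomes A_ij / sqrt(d_i * d_j).
A_norm = d_inv_sqrt * A * d_inv_sqrt.T

assert np.allclose(A_norm, A_norm.T)  # normalization preserves symmetry
```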
diff --git a/applications/graph/GNN/data/MNIST_Superpixel/utils.py b/applications/graph/GNN/data/MNIST_Superpixel/utils.py new file mode 100644 index 00000000000..bdc0256b464 --- /dev/null +++ b/applications/graph/GNN/data/MNIST_Superpixel/utils.py @@ -0,0 +1,100 @@ +import torch +import urllib.request +import tarfile +import os +import os.path +import numpy as np +import lbann + +data_dir = os.path.dirname(os.path.realpath(__file__)) + +def download_data(): + url = "http://ls7-www.cs.uni-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz" + training_name = "training.pt" + test_name = "test.pt" + + files = [training_name, test_name] + + for f in files: + data_file = os.path.join(data_dir, f) + + if not os.path.isfile(data_file): #File not in directory + tar_name = os.path.join(data_dir, "mnist_superpixel.tar.gz") + + if not os.path.isfile(tar_name): + urllib.request.urlretrieve(url, filename=tar_name) + extract_data() + else: + extract_data() + +def extract_data(): + tar_name = os.path.join(data_dir, "mnist_superpixel.tar.gz") + print(tar_name) + with tarfile.open(tar_name) as tar: + tar.extractall(path=data_dir) +def edge_list_to_dense(elist): + adj_mat = np.zeros((75,75), dtype=np.float) + + ## elist should be of shape (2, num_edges) + + num_edges = elist.size(1) + + for edge in range(num_edges): + source, sink = elist[:,edge] + source = source.item() + sink = sink.item() + adj_mat[source][sink] = 1.0 + adj_mat[sink][source] = 1.0 + + return adj_mat + +def process_training_data(): # Process Training File + train_file_path = os.path.join(data_dir, 'training.pt') + #test_file_path = os.path.join(data_dir, 'test.pt') + + node_features, edge_index, edge_slices, positions, y = torch.load(train_file_path) + + assert y.size(0) == node_features.size(0) + assert y.size(0) == positions.size(0) + assert y.size(0) == 60000 + + num_data = 60000 + num_vertices = 75 + # Node features should be (60000, 75) + + node_features = np.float32(node_features) + + # Position should be (60000, 75, 2) + + positions = np.float32(positions) + + # Convert edge_index to edge matrix representation with shape (60000, 75, 75) + + adj_matrices = np.zeros( (num_data, num_vertices, num_vertices), dtype=np.float) + + #assert (self.num_data + 1) == edge_slices.size(0), "Expected: {}, Got{} ".format(60001, edge_slices.size(0)) + + for slice_index in range(num_data): + print("{}/{} completed \r".format(slice_index+1, num_data), end='',flush=True) + start_index = edge_slices[slice_index] + end_index = edge_slices[slice_index + 1] + + graph_num = slice_index + elist = edge_index[:, start_index: end_index ] + + adj_matrices[graph_num] = edge_list_to_dense(elist) + + + # Convert y to target with one hot encoding and shape (60000, 10) + + targets = np.zeros ( (num_data, 10), dtype=np.float) + + for i, target in enumerate(y): + print("{}/{} completed \r".format(i+1, len(y)), end='') + targets[i][target] = 1 + + np.save('node_features.npy',node_features) + np.save('positions.npy',positions) + np.save('adj_matrices.npy', adj_matrices) + np.save('targets.npy',targets) diff --git a/applications/graph/GNN/data/PROTEINS/PROTEINS_Dataset.py b/applications/graph/GNN/data/PROTEINS/PROTEINS_Dataset.py new file mode 100644 index 00000000000..b49103efacc --- /dev/null +++ b/applications/graph/GNN/data/PROTEINS/PROTEINS_Dataset.py @@ -0,0 +1,76 @@ +import numpy as np +import os +import os.path +import sys +import utils + +files = ['node_features.npy', 'adj_mats.npy', 'targets.npy'] + +data_dir = os.path.dirname(os.path.realpath(__file__)) + +class PROTEINS_Dataset: + def __init__(self): + # Check if data is downloaded and processed + # Load if data exists + # Else download and process data + for npy_file in files: + if not os.path.isfile(os.path.join(data_dir,"PROTEINS/"+npy_file)): + self.process_data() + + self.node_features = np.load(os.path.join(data_dir, "PROTEINS/"+files[0])) + self.adjs = np.load(os.path.join(data_dir,"PROTEINS/"+files[1])) + self.targets = np.load(os.path.join(data_dir, "PROTEINS/"+files[2])) + + def generate_dataset(self): + global data_dir + print(data_dir) + data_dir = os.path.join(data_dir, 'PROTEINS') + node_features, adj_mat, targets = utils.TUDataset_Parser(data_dir, 'PROTEINS', 2) + np.save(os.path.join(data_dir, files[0]), node_features) + np.save(os.path.join(data_dir, files[1]), adj_mat) + np.save(os.path.join(data_dir, files[2]), targets) + + def process_data(self): + if not os.path.isfile(os.path.join(data_dir, "PROTEINS.zip")): + #Needs Download + url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/PROTEINS.zip' + save_path = os.path.join(data_dir, 'PROTEINS.zip') + utils.download_url(url, save_path) + utils.unzip_file(os.path.join(data_dir, "PROTEINS.zip")) + + self.generate_dataset() + + def __len__(self): + + return len(self.node_features) + def __getitem__(self, index): + + x = np.float32(self.node_features[index].flatten()) + y = np.float32(self.targets[index].flatten()) + adj = np.float32(self.adjs[index].flatten()) + + return np.concatenate((x,adj,y), axis=0) + +training_data = PROTEINS_Dataset() + +def get_train(index): + return training_data[index] + +def num_train_samples(): + return len(training_data) + +def sample_dims(): + adjacency_matrix_size = 100 * 100 + node_feature_size = 100 * 3 + target_size = 2 + return (adjacency_matrix_size + node_feature_size + target_size, ) + +if __name__ == '__main__': + print(len(training_data)) + print(training_data.node_features[0].shape) + print(training_data.adjs[0].shape) + print(training_data.targets[0].shape) + print(type(training_data[0][0])) + print(sys.getsizeof(training_data[0][0]))
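The flat vector returned by __getitem__ above is what the trainers later take apart again with lbann.Slice. A NumPy sketch of that round trip, assuming the PROTEINS preset sizes (100 vertices, 3 features per node, 2 classes); the names here are illustrative only:

```
import numpy as np

# Round-trip sketch of the flat sample layout produced by __getitem__ above.
num_vertices, node_features, num_classes = 100, 3, 2

x = np.random.rand(num_vertices, node_features).astype(np.float32)
adj = np.eye(num_vertices, dtype=np.float32)
y = np.array([1.0, 0.0], dtype=np.float32)

sample = np.concatenate((x.flatten(), adj.flatten(), y))  # shape (10302,)

# The trainers undo this with lbann.Slice at these offsets:
f_end = num_vertices * node_features            # 300
g_end = f_end + num_vertices * num_vertices     # 10300
x2 = sample[:f_end].reshape(num_vertices, node_features)
adj2 = sample[f_end:g_end].reshape(num_vertices, num_vertices)
y2 = sample[g_end:]

assert np.array_equal(x, x2) and np.array_equal(adj, adj2) and np.array_equal(y, y2)
```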
diff --git a/applications/graph/GNN/data/PROTEINS/__init__.py b/applications/graph/GNN/data/PROTEINS/__init__.py new file mode 100644 index 00000000000..6204311b693 --- /dev/null +++ b/applications/graph/GNN/data/PROTEINS/__init__.py @@ -0,0 +1,24 @@ +import urllib.request +import tarfile +import os +import os.path + +import lbann + +data_dir = os.path.dirname(os.path.realpath(__file__)) +def make_data_reader(): #TO DO: Extend this to use this for validation / test set as well after testing + + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = True #Shuffle the training samples + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'PROTEINS_Dataset' + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = 'get_train' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + + return reader + diff --git a/applications/graph/GNN/data/PROTEINS/utils.py b/applications/graph/GNN/data/PROTEINS/utils.py new file mode 100644
index 00000000000..83f18fdc673 --- /dev/null +++ b/applications/graph/GNN/data/PROTEINS/utils.py @@ -0,0 +1,126 @@ +import urllib.request +import tarfile +import zipfile +import os.path +import numpy as np + +def download_url(url, save_path): + with urllib.request.urlopen(url) as dl_file: + with open(save_path, 'wb') as out_file: + out_file.write(dl_file.read()) + +def untar_file(data_dir, file_name): + tar_name = os.path.join(data_dir, file_name) + with tarfile.open(tar_name) as tar: + tar.extractall(path=data_dir) +def unzip_file(file_name, data_dir=None): + if (data_dir is None): + data_dir = os.path.dirname(file_name) + + with zipfile.ZipFile(file_name, 'r') as zip_ref: + zip_ref.extractall(data_dir) + +def edge_list_to_dense(elist, num_vertices = 75): + adj_mat = np.zeros((num_vertices,num_vertices), dtype=np.float) + num_edges = elist.shape[0] + for edge in range(num_edges): + source, sink = elist[edge,:] + source = source.item() + sink = sink.item() + adj_mat[source][sink] = 1.0 + adj_mat[sink][source] = 1.0 + return adj_mat + + +######################################################## + # + # TU Dataset specific functions + # +######################################################## + +def extract_node_features(node_slices, node_labels, max_nodes, num_classes = None): + node_label_list = [] + for i, ind in enumerate(node_slices[1:]): + if num_classes: + graph_x = np.eye(num_classes)[np.asarray([int(x) for x in node_labels[node_slices[i]:ind]],dtype=np.int)] + else: + graph_x = np.asarray([int(x) for x in node_labels[node_slices[i]:ind]],dtype=np.int) + if (len(graph_x) < max_nodes): + pad = max_nodes - len(graph_x) + graph_x = np.pad(graph_x, ((0,pad),(0,0)), 'constant') + node_label_list.append(graph_x) + return node_label_list + + +def extract_adj_mat(node_slices, edge_list, max_nodes): + adj_mat_list = [] + removed_graphs = [] + for i, max_node_id in enumerate(node_slices[1:]): + min_node_id = node_slices[i] + num_nodes = max_node_id - min_node_id + if (num_nodes < max_nodes): + edges = edge_list[(edge_list[:,1] > min_node_id) & (edge_list[:,1] < max_node_id)] + edges = edges - 1 - min_node_id + adj_mat = edge_list_to_dense(edges, max_nodes) + adj_mat_list.append(adj_mat) + else: + removed_graphs.append(i) + + return adj_mat_list, removed_graphs + +def extract_targets(graph_labels, num_classes, removed_graphs): + graph_labels = np.array([int(x) for x in graph_labels]) + labels = np.eye(num_classes)[graph_labels-1] + graph_labels = np.delete(labels, removed_graphs, axis=0) + return graph_labels + +def dataset_node_slices(graph_indicator_list, num_graphs): + node_slices = [] + + prev = 0 + for i in range(num_graphs+1): + node_slices.append(prev+graph_indicator_list.count(str(i))) + prev = prev + graph_indicator_list.count(str(i)) + return node_slices + +def TUDataset_Parser(data_dir, dataset_name, num_classes): + + adj_file = open(os.path.join(data_dir, dataset_name + '_A.txt'), 'r') + graph_labels_file = open(os.path.join( data_dir, dataset_name + '_graph_labels.txt'), 'r') + graph_ind_file = open(os.path.join( data_dir, dataset_name + '_graph_indicator.txt'), 'r') + node_attr_file = open(os.path.join( data_dir, dataset_name + '_node_attributes.txt'), 'r') + node_labels_file = open(os.path.join( data_dir, dataset_name + '_node_labels.txt'), 'r') + + graph_labels = graph_labels_file.read().rstrip().split('\n') + graph_ind = graph_ind_file.read().rstrip().split('\n') + node_attr = node_attr_file.read().rstrip().split('\n') + adj_list = adj_file.read().rstrip().split('\n') + 
node_labels = node_labels_file.read().rstrip().split('\n') + + NUM_GRAPHS = len(graph_labels) + NUM_NODES = len(node_attr) + NUM_EDGES = len(adj_list) + + adj_file.close() + graph_labels_file.close() + graph_ind_file.close() + node_attr_file.close() + node_labels_file.close() + edge_list = [] + for edge in adj_list: + edge = np.array([int(x) for x in edge.split(',')]) + edge_list.append(edge) + + edge_list = np.array(edge_list) + + node_slices = dataset_node_slices(graph_ind, NUM_GRAPHS) + + max_nodes = 100 + adj_mat, removed_graphs = extract_adj_mat(node_slices, edge_list, max_nodes) + num_features = 3 + node_features = extract_node_features(node_slices, node_labels,max_nodes, num_features) + node_features = np.array(node_features) + targets = extract_targets(graph_labels, num_classes, removed_graphs) + + return node_features, adj_mat, targets diff --git a/applications/graph/GNN/data/__init__.py b/applications/graph/GNN/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/graph/GNN/main.py b/applications/graph/GNN/main.py new file mode 100644 index 00000000000..cebf7c679a7 --- /dev/null +++ b/applications/graph/GNN/main.py @@ -0,0 +1,91 @@ +import lbann +import lbann.contrib.launcher +import lbann.contrib.args + +import argparse +import os + +import Sparse_Graph_Trainer +import Dense_Graph_Trainer +import data.MNIST_Superpixel +import data.PROTEINS + +desc = ("Training a Graph Convolutional Model using LBANN") + +parser = argparse.ArgumentParser(description=desc) + +lbann.contrib.args.add_scheduler_arguments(parser) +lbann.contrib.args.add_optimizer_arguments(parser) + +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') + +parser.add_argument( + '--mini-batch-size', action='store',default=32, type=int, + help="mini-batch size (default: 32)", metavar='NUM') + +parser.add_argument( + '--dataset', action='store', default='MNIST', type=str, + help="Dataset for model (default: MNIST)", metavar='NAME') + +parser.add_argument( + '--job-name', action='store', default="GCN_TEST", type=str, + help="Job name for scheduler", metavar='NAME') + +parser.add_argument( + '--model', action = 'store', default='GCN', type=str, + help="The type of model to use", metavar='NAME') + +args = parser.parse_args() + + +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + +dataset = args.dataset +num_epochs = args.num_epochs +mini_batch_size = args.mini_batch_size +job_name = args.job_name +model_arch = args.model + + +## Get Model + +if (model_arch == 'GRAPH'): + model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS', + kernel_type = 'Graph', + num_epochs = num_epochs) +elif(model_arch=='GIN'): + model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS', + kernel_type = 'GIN', + num_epochs = num_epochs) +elif(model_arch=='GATEDGRAPH'): + model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS', + kernel_type = 'GatedGraph', + num_epochs = num_epochs) +elif (model_arch =='DGCN'): + model = Dense_Graph_Trainer.make_model(dataset = 'PROTEINS', + kernel_type = 'GCN', + num_epochs = num_epochs) +elif (model_arch == 'DGRAPH'): + model = Dense_Graph_Trainer.make_model(dataset = 'PROTEINS', + kernel_type = 'Graph', + num_epochs = num_epochs) +else: + model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS', + kernel_type = 'GCN', + num_epochs=num_epochs) + + +optimizer = lbann.SGD(learn_rate = 1e-3) + +# Add logic for choosing a dataset + +data_reader = 
data.PROTEINS.make_data_reader() + +trainer = lbann.Trainer(mini_batch_size = mini_batch_size) + + +lbann.contrib.launcher.run(trainer, model, data_reader, optimizer, + job_name = job_name, + **kwargs) diff --git a/applications/graph/GNN/test/__init__.py b/applications/graph/GNN/test/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/graph/GNN/test/conftest.py b/applications/graph/GNN/test/conftest.py new file mode 100644 index 00000000000..0179df3687b --- /dev/null +++ b/applications/graph/GNN/test/conftest.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../../../../bamboo/common_python') +import tools +import pytest, re, subprocess + + +def pytest_addoption(parser): + cluster = re.sub('[0-9]+', '', subprocess.check_output( + 'hostname'.split()).decode('utf-8').strip()) + default_dirname = subprocess.check_output( + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() + default_exes = tools.get_default_exes(default_dirname, cluster) + + parser.addoption('--cluster', action='store', default=cluster, + help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') + parser.addoption('--dirname', action='store', default=default_dirname, + help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable') + parser.addoption('--exes', action='store', default=default_exes, + help='--exes={compiler_name: path}') + parser.addoption('--weekly', action='store_true', default=False, + help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False') + + +@pytest.fixture +def cluster(request): + return request.config.getoption('--cluster') + + +@pytest.fixture +def dirname(request): + return request.config.getoption('--dirname') + + +@pytest.fixture +def exes(request): + return request.config.getoption('--exes') + + +@pytest.fixture +def weekly(request): + return request.config.getoption('--weekly') diff --git a/applications/graph/GNN/test/test_integration_DGCN.py b/applications/graph/GNN/test/test_integration_DGCN.py new file mode 100644 index 00000000000..b94489483fd --- /dev/null +++ b/applications/graph/GNN/test/test_integration_DGCN.py @@ -0,0 +1,153 @@ +import functools +import operator +import os +import os.path +import re +import sys +import pytest +import lbann + +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +root_dir = os.path.dirname(current_dir) + +import data.PROTEINS +import Dense_Graph_Trainer + +graph_dir = os.path.dirname(root_dir) +applications_dir = os.path.dirname(graph_dir) +lbann_dir = os.path.dirname(applications_dir) +common_python_dir = os.path.join(lbann_dir, 'bamboo/common_python')# Added lbann/bamboo/common_python +sys.path.append(common_python_dir) +import tools + + +num_epochs = 30 +mini_batch_size = 64 +num_nodes = 2 + + +expected_accuracy_range = (64, 71) + +expected_mini_batch_times = { + 'ray' : 0.005 + } +expected_gpu_usage = { + 'ray' : 0.7 + } + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + args: + lbann (module): Module for LBANN Python frontend + + """ + + + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + + + + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer(), lbann.CallbackGPUMemoryUsage()] + + + + model = Dense_Graph_Trainer.make_model(dataset = 'PROTEINS', + kernel_type = 'GCN', + num_epochs = num_epochs, + callbacks = callbacks) + reader = data.PROTEINS.make_data_reader() + + # No validation set + + optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8 ) + return trainer, model, reader, optimizer + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname): + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + gpu_usage = None + mini_batch_times = [] + gpu_usages = [] + + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + match = re.search('GPU memory usage statistics : ([0-9.]+) GiB mean', line) + if match: + gpu_usages.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_accuracy_range[0] + < train_accuracy + self.output_channels): + ValueError('The feature size of the nodes {} cannot be greater than the output dimension {}'. + format(input_features, self.output_channels)) + + X.update_num_features(self.output_channels) + + for layer in range(self.num_layers): + ## + X_mat = X.get_mat() + messages = lbann.MatMul(X_mat, self.weights[layer]) + aggregate = lbann.MatMul(A,messages) + + M = GraphVertexData.matrix_to_graph(aggregate, num_nodes, self.output_channels) + + for i in range(num_nodes): + X[i] = lbann.Reshape(X[i], dims = str(self.output_channels)) + X[i] = lbann.Reshape(self.rnn(M[i], X[i])[1], + dims = str_list([1, self.output_channels])) + + return X diff --git a/python/lbann/modules/graph/sparse/GraphConv.py b/python/lbann/modules/graph/sparse/GraphConv.py new file mode 100644 index 00000000000..327d47bb945 --- /dev/null +++ b/python/lbann/modules/graph/sparse/GraphConv.py @@ -0,0 +1,132 @@ +import lbann +from lbann.modules import Module +from lbann.modules.graph.utils import GraphVertexData +from lbann.util import str_list +import lbann.modules.base +import math + +class GraphConv(Module): + """ Graph Conv layer. 
See: + + https://arxiv.org/abs/1609.02907 + + """ + + global_count = 0 + + def __init__(self, + input_channels, + output_channels, + bias=True, + activation = lbann.Relu, + name=None, + data_layout = 'data_parallel'): + """Initialize Graph layer + + Args: + input_channels (int): The size of the input node features + output_channels (int): The output size of the node features + bias (bool): Whether to apply biases after MatMul + name (str): Default name of the layer is Graph_{number} + data_layout (str): Data layout + activation (type): Activation layer for the node features. If None, then no activation is + applied. (default: lbann.Relu) + """ + super().__init__() + + ## Add variables + + self.input_channels = input_channels + self.output_channels = output_channels + self.data_layout = data_layout + + ## Add Name for the components for the layer + GraphConv.global_count +=1 + self.name = (name + if name + else 'Graph_{}'.format(GraphConv.global_count)) + + ## Initialize weights for the matrix + value = math.sqrt(6/ (input_channels + output_channels)) + + self.mat_weights = lbann.Weights(initializer = lbann.UniformInitializer( + min = -value, + max = value), + name = self.name+'_Weights') + + self.weights1 = lbann.WeightsLayer(dims = str_list([input_channels, output_channels]), + name = self.name+'_layer', + weights = self.mat_weights) + + self.id_weights = lbann.Weights(initializer = lbann.UniformInitializer( + min = -value, + max = value), + name = self.name+'_ID_Weights') + + self.weights2 = lbann.WeightsLayer(dims = str_list([input_channels, output_channels]), + name = self.name+'_ID_layer', + weights = self.id_weights) + + ## Initialize bias variables + self.has_bias = bias + self.bias_weights = None + self.bias = None + + if (self.has_bias): + self.bias_weights = lbann.Weights(initializer = lbann.ConstantInitializer( + value = 0.0), + name = self.name+'_bias_weights') + self.bias = lbann.WeightsLayer(dims = str_list([1,output_channels]), + weights = self.bias_weights, + name = self.name+'_bias_layer') + + self.activation = None + + if activation: + if isinstance(activation, type): + self.activation = activation + else: + self.activation = type(activation) + if not issubclass(self.activation, lbann.Layer): + raise ValueError('activation must be a layer') + + def forward(self, X, A): + """Apply Graph Conv Layer to X and use A for message passing + + Args: + X (GraphVertexData): LBANN Data object, which is a collection of Layers. Each Layer is of + the shape (1,input_channels) + + A (Layer): Adjacency matrix input with shape (num_nodes, num_nodes) + + Returns: + + GraphVertexData: The output after convolution. The output can be passed into another Graph Conv layer + directly + """ + + # Accumulate Messages from Neighboring Nodes + out = X.get_mat() + out = lbann.MatMul(out,self.weights1, name = self.name+"_Graph_MATMUL") + message = lbann.MatMul(A, out, name = self.name+"_Graph_Message") + message = GraphVertexData.matrix_to_graph(message, X.shape[0], self.output_channels) + + # Assume X is a GraphVertexData object + + for node_feature in range(X.shape[0]): + X[node_feature] = lbann.MatMul(X[node_feature], self.weights2) + + for node_feature in range(X.shape[0]): + if (self.bias): + message[node_feature] = lbann.Sum(message[node_feature], + self.bias, + name=self.name+'_message_bias_'+str(node_feature)) + X[node_feature] = lbann.Sum(X[node_feature], message[node_feature]) + + if self.activation: + for node_feature in range(X.shape[0]): + X[node_feature] = self.activation(X[node_feature]) + + X.update_num_features(self.output_channels) + return X + 
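In dense matrix form, the forward pass above amounts to X' = relu(A X W1 + X W2 + b), where weights1 (W1) transforms the aggregated neighbor messages and weights2 (W2) the node's own features. A NumPy sketch of the same update, with made-up sizes:

```
import numpy as np

# Dense sketch of the GraphConv forward pass above:
#   X' = relu(A @ X @ W1 + X @ W2 + b)
num_nodes, in_ch, out_ch = 4, 8, 16
rng = np.random.default_rng(0)

X = rng.random((num_nodes, in_ch))                                  # node features
A = rng.integers(0, 2, size=(num_nodes, num_nodes)).astype(float)   # adjacency
W1 = rng.random((in_ch, out_ch))                                    # message weights
W2 = rng.random((in_ch, out_ch))                                    # self (identity-path) weights
b = np.zeros(out_ch)                                                # bias, zero-initialized as above

X_new = np.maximum(A @ X @ W1 + X @ W2 + b, 0.0)                    # relu activation
assert X_new.shape == (num_nodes, out_ch)
```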
+                passed into another Graph Conv layer directly.
+        """
+
+        # Accumulate messages from neighboring nodes
+        out = X.get_mat()
+        out = lbann.MatMul(out, self.weights1, name = self.name+"_Graph_MATMUL")
+        message = lbann.MatMul(A, out, name = self.name+"_Graph_Message")
+        message = GraphVertexData.matrix_to_graph(message, X.shape[0], self.output_channels)
+
+        # Assume X is a GraphVertexData object
+
+        for node_feature in range(X.shape[0]):
+            X[node_feature] = lbann.MatMul(X[node_feature], self.weights2)
+
+        for node_feature in range(X.shape[0]):
+            if (self.bias):
+                message[node_feature] = lbann.Sum(message[node_feature],
+                                                  self.bias,
+                                                  name=self.name+'_message_bias_'+str(node_feature))
+            X[node_feature] = lbann.Sum(X[node_feature], message[node_feature])
+
+        if self.activation:
+            for node_feature in range(X.shape[0]):
+                X[node_feature] = self.activation(X[node_feature])
+
+        X.update_num_features(self.output_channels)
+        return X
+
diff --git a/python/lbann/modules/graph/sparse/__init__.py b/python/lbann/modules/graph/sparse/__init__.py
new file mode 100644
index 00000000000..9b089525863
--- /dev/null
+++ b/python/lbann/modules/graph/sparse/__init__.py
@@ -0,0 +1,11 @@
+"""Neural network modules for graph convolutional models."""
+from .GINConv import GINConv
+from .GCNConv import GCNConv
+from .GraphConv import GraphConv
+from .GatedGraphConv import GatedGraphConv
+__all__ = [
+    'GCNConv',
+    'GINConv',
+    'GraphConv',
+    'GatedGraphConv'
+    ]
diff --git a/python/lbann/modules/graph/utils.py b/python/lbann/modules/graph/utils.py
new file mode 100644
index 00000000000..7c9aae6fd94
--- /dev/null
+++ b/python/lbann/modules/graph/utils.py
@@ -0,0 +1,103 @@
+import lbann
+from lbann.util import str_list
+
+class GraphVertexData:
+    def __init__(self, layers, num_features):
+        """Object to hold a list of layers, where each layer represents a vertex
+        in a graph.
+
+        Args:
+            layers (iterator of layers): One-dimensional iterator of node
+                features with N nodes
+            num_features (int): the number of features per vertex
+
+        """
+        self.shape = (len(layers), num_features)
+        self.layers = layers
+        self.num_nodes = len(layers)
+        self.num_features = num_features
+
+    def __getitem__(self, node):
+        """Get the feature vector of the given node, represented as an LBANN layer.
+
+        Args:
+            node (int): The node to retrieve the features for.
+
+        Returns:
+            (Layer): the features of the vertex of the graph.
+
+        """
+        return self.layers[node]
+    def __setitem__(self, node, feature):
+        """Set the feature vector of the node-th vertex.
+
+        Args:
+            node (int): index of the vertex to update.
+            feature (Layer): new feature layer for the vertex.
+        """
+        self.layers[node] = feature
+    def update_num_features(self, num_features):
+        """Update the internal shapes to keep track of features.
+
+        Args:
+            num_features (int): the features per vertex
+        """
+        self.num_features = num_features
+        self.shape = (len(self.layers), num_features)
+    def size(self, index = None):
+        """Get the size (shape) of the GraphVertexData object, where the size is
+        represented as a tuple (n,m), where n is the number of nodes and m is
+        the number of features per node.
+
+        Args:
+            index (int): 0 to return the number of nodes and 1 to return the
+                number of features.
+
+        Returns:
+            (int) or (int,int): Either the tuple (n,m), or n, or m.
+
+        """
+        if isinstance(index, int):
+            return self.shape[index]
+        else:
+            return self.shape
+
+    def get_mat(self, cols = None):
+        """Generates a matrix representation of the graph data.
+
+        Args:
+            cols (int): if provided, the number of columns (features per
+                node) to reshape to; otherwise the stored number of
+                features is used.
+        """
+
+        mat = lbann.Concatenation(self.layers)
+
+        if (cols):
+            mat = lbann.Reshape(mat, dims=str_list([self.shape[0], cols]))
+        else:
+            mat = lbann.Reshape(mat, dims=str_list([self.shape[0], self.shape[1]]))
+
+        return mat
+
+    def clone(self):
+        """Generates a clone of the GraphVertexData object. Results in a
+        split in the DAG.
+        """
+        cloned_layers = []
+        for i, node in enumerate(self.layers):
+            temp = lbann.Split(node)
+            self.layers[i] = lbann.Identity(temp)
+            cloned_layers.append(lbann.Identity(temp))
+
+        return GraphVertexData(cloned_layers, self.num_features)
+
+    @classmethod
+    def matrix_to_graph(cls, mat_layer, num_vertices, num_features):
+        """Given a 2D matrix of shape (num_vertices, num_features), returns a
+        GraphVertexData object with num_vertices nodes, each with
+        num_features features.
+
+        """
+        slice_points = str_list([i for i in range(0, num_vertices * num_features + 1, num_features)])
+        flattened_layer = lbann.Reshape(mat_layer, dims = str(num_vertices * num_features))
+        sliced_mat_layer = lbann.Slice(flattened_layer, axis = 0, slice_points = slice_points)
+
+        list_of_layers = []
+        for node in range(num_vertices):
+            temp = lbann.Identity(sliced_mat_layer)
+            list_of_layers.append(lbann.Reshape(temp, dims=str_list([1, num_features])))
+        return cls(list_of_layers, num_features)
From 497da76eeba8e9aa26920dfb5c613da67ca352aa Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Tue, 18 Aug 2020 11:24:22 -0700
Subject: [PATCH 08/36] Reset data coordinator after each LTFB round (#1599)

---
 include/lbann/io/data_buffers/generic_io_buffer.hpp | 6 +++---
 src/callbacks/ltfb.cpp                              | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/lbann/io/data_buffers/generic_io_buffer.hpp b/include/lbann/io/data_buffers/generic_io_buffer.hpp
index a8d4f7ecec0..eedfccdac68 100644
--- a/include/lbann/io/data_buffers/generic_io_buffer.hpp
+++ b/include/lbann/io/data_buffers/generic_io_buffer.hpp
@@ -59,9 +59,9 @@ class fetch_data_functor {
       num_responses_fetched = data_reader->fetch_labels(responses);
     }
     if(num_samples_fetched != num_responses_fetched) {
-      std::string err = std::string("Number of samples: ") + std::to_string(num_samples_fetched)
-        + std::string(" does not match the number of responses: ") + std::to_string(num_responses_fetched);
-      throw lbann_exception(err);
+      LBANN_ERROR("Number of samples (",num_samples_fetched,") ",
+                  "does not match the ",
+                  "number of responses (",num_responses_fetched,")");
     }
     return num_samples_fetched;
   }
diff --git a/src/callbacks/ltfb.cpp b/src/callbacks/ltfb.cpp
index ace193e3fb9..c682e628a4c 100644
--- a/src/callbacks/ltfb.cpp
+++ b/src/callbacks/ltfb.cpp
@@ -339,7 +339,8 @@ EvalType evaluate(model& m, const std::string& metric_name) {
     m.make_data_store_preloaded(execution_mode::validation);

   // Clean up and return metric value
-  c.set_execution_mode(original_mode);
+  m.reset_mode(c, original_mode);
+  c.get_trainer().get_data_coordinator().reset_mode(c);
   return metric_value;
 }

From 3b58587c13d906169d30f6948ff850a5c71aeac4 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 20 Aug 2020 11:13:09 -0700
Subject: [PATCH 09/36] Fix bug in weights proxy when weights buffer is
 reallocated (#1602)

---
 include/lbann/weights/weights_proxy.hpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/lbann/weights/weights_proxy.hpp b/include/lbann/weights/weights_proxy.hpp
index 989bf79c6c7..81316abeb51 100644
--- a/include/lbann/weights/weights_proxy.hpp
+++ b/include/lbann/weights/weights_proxy.hpp
@@
-262,8 +262,14 @@ class WeightsProxy */ void synchronize_with_master() { - if (!empty() && !values_->Viewing()) { - El::Copy(master_weights_->get_values(), *values_); + if (!empty()) { + const auto& master_values = master_weights_->get_values(); + if (values_->Viewing()) { + El::LockedView(*values_, dynamic_cast(master_values)); + } + else { + El::Copy(master_values, *values_); + } } } From d5b9f084e03a60745a555bf712c98bc5fbea7120 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Mon, 24 Aug 2020 09:32:26 -0700 Subject: [PATCH 10/36] ATOM VAE model (#1601) * draft implementation of ATOM VAE * VAE draft * VAE draft * Add smaller (10K) dataset, and model cleanup * Add smaller (10K) dataset, and model cleanup --- applications/ATOM/models/vae.py | 164 ++++++++++++++ applications/ATOM/train_atom_vae.py | 251 +++++++++++++++++++++ applications/ATOM/zinc10k_data_config.json | 10 + 3 files changed, 425 insertions(+) create mode 100644 applications/ATOM/models/vae.py create mode 100644 applications/ATOM/train_atom_vae.py create mode 100644 applications/ATOM/zinc10k_data_config.json diff --git a/applications/ATOM/models/vae.py b/applications/ATOM/models/vae.py new file mode 100644 index 00000000000..64358b6d0b5 --- /dev/null +++ b/applications/ATOM/models/vae.py @@ -0,0 +1,164 @@ +import lbann +import lbann.modules +from math import sqrt +from lbann.util import make_iterable + +def str_list(l): + """Convert an iterable object to a space-separated string.""" + return ' '.join(str(i) for i in make_iterable(l)) + +class MolVAE(lbann.modules.Module): + """Molecular VAE. + + See: + https://github.com/samadejacobs/moses/tree/master/moses/vae + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, input_feature_dims,dictionary_size, embedding_size, ignore_label, name=None): + """Initialize Molecular VAE. + + Args: + input_feature_dims (int): analogous to sequence length. + dictionary_size (int): vocabulary size + embedding_size (int): embedding size + ignore_label (int): padding index + name (str, optional): Module name + (default: 'molvae_module'). 
+
+        """
+        MolVAE.global_count += 1
+        self.instance = 0
+        self.name = (name if name
+                     else 'molvae_module{0}'.format(MolVAE.global_count))
+
+        self.input_feature_dims = input_feature_dims
+        self.embedding_size = embedding_size
+        self.dictionary_size = dictionary_size
+        self.label_to_ignore = ignore_label
+
+        fc = lbann.modules.FullyConnectedModule
+        gru = lbann.modules.GRU
+        # Encoder
+        winit = lbann.GlorotNormalInitializer()
+        self.encoder_rnn = gru(size=256, name=self.name+'_encoder_rnn')
+        self.q_mu = fc(128, name=self.name+'_qmu')
+        self.q_logvar = fc(128, name=self.name+'_qlogvar')
+        # Decoder
+        self.decoder_rnn0 = gru(size=512, name=self.name+'_decoder_rnn0')
+        self.decoder_rnn1 = gru(size=512, name=self.name+'_decoder_rnn1')
+        self.decoder_rnn2 = gru(size=512, name=self.name+'_decoder_rnn2')
+        self.decoder_lat = fc(512, name=self.name+'_decoder_lat')
+        self.decoder_fc = fc(dictionary_size, name=self.name+'_decoder_fc')
+        # Shared encoder/decoder weights
+        self.emb_weights = lbann.Weights(initializer=lbann.NormalInitializer(mean=0, standard_deviation=1),
+                                         name='emb_matrix')
+
+    def forward(self, x):
+        """Do the VAE forward step.
+
+        :param x: list of tensors of longs, embed representation of input
+        :return: float, kl term component of loss
+        :return: float, recon component of loss
+        """
+
+        emb = lbann.Embedding(x,
+                              num_embeddings=self.dictionary_size,
+                              embedding_dim=self.embedding_size,
+                              name='emb',
+                              weights=self.emb_weights)
+        emb_slice = lbann.Slice(emb,
+                                axis=0,
+                                slice_points=str_list(range(self.input_feature_dims+1)),
+                                name='emb_slice')
+        emb_list = [lbann.Reshape(emb_slice, dims='-1', name='emb'+str(i))
+                    for i in range(self.input_feature_dims)]
+
+        # Encoder: x -> z, kl_loss
+        z, kl_loss = self.forward_encoder(emb_list)
+
+        # Decoder: x, z -> recon_loss
+        recon_loss, arg_max = self.forward_decoder(x, emb_list, z)
+
+        return kl_loss, recon_loss, arg_max
+
+    def forward_encoder(self, emb_list):
+        """Encoder step, emulating z ~ E(x) = q_E(z|x).
+
+        :param emb_list: list of tensors of floats, input sentence embeddings
+        :return: (n_batch, d_z) of floats, sample of latent vector z
+        :return: float, kl term component of loss
+        """
+
+        h = lbann.Constant(value=0.0, num_neurons='256')
+        for i in range(self.input_feature_dims):
+            _, h = self.encoder_rnn(emb_list[i], h)
+
+        mu, logvar = self.q_mu(h), self.q_logvar(h)
+
+        # eps = torch.randn_like(mu)
+        eps = lbann.Gaussian(mean=0, stdev=1, hint_layer=mu)
+
+        # z = mu + (logvar / 2).exp() * eps
+        z = lbann.Add([mu, (lbann.Multiply([lbann.Exp(lbann.WeightedSum(logvar, scaling_factors='0.5')), eps]))])
+
+        # kl_loss = 0.5 * (logvar.exp() + mu ** 2 - 1 - logvar).sum(1).mean()
+        kl_loss = lbann.Reduction(lbann.WeightedSum(
+                                      [lbann.Exp(logvar),
+                                       lbann.Square(mu),
+                                       lbann.Constant(value=1.0, hint_layer=mu),
+                                       logvar],
+                                      scaling_factors='0.5 0.5 -0.5 -0.5'),
+                                  mode='sum')
+
+        return z, kl_loss
+
+    def forward_decoder(self, x, emb_list, z):
+        """Decoder step, emulating x ~ G(z).
+
+        :param x: list of tensors of longs, input sentence x
+        :param emb_list: embeddings of x
+        :param z: (n_batch, d_z) of floats, latent vector z
+        :return: float, recon component of loss
+        """
+
+        # x[:, 1:]
+        xshift = lbann.Slice(x, slice_points=str_list([1, self.input_feature_dims]))
+        xshift = lbann.Identity(xshift)
+        xshift_slice = lbann.Slice(xshift, slice_points=str_list(range(self.input_feature_dims)))
+        xshift_list = [lbann.Identity(xshift_slice) for i in range(self.input_feature_dims-1)]
+
+        # Unroll RNN
+        h = [self.decoder_lat(z)] * 3
+        recon_loss = []
+        arg_max
= [] + for i in range(self.input_feature_dims-1): + + # RNN stack + x_input = lbann.Concatenation(emb_list[i], z) + _, h[0] = self.decoder_rnn0(x_input, h[0]) + _, h[1] = self.decoder_rnn1(h[0], h[1]) + _, h[2] = self.decoder_rnn2(h[1], h[2]) + output = h[2] + #output = h[0] + y = self.decoder_fc(output) + arg_max.append(lbann.Argmax(y,device='CPU')) + + # Cross entropy loss + y = lbann.Softmax(y) + xshift_onehot = lbann.OneHot(xshift_list[i], size=self.dictionary_size) + recon_loss.append(lbann.CrossEntropy(y, xshift_onehot)) + + # Average cross entropy over sequence length + pad_mask = lbann.NotEqual(xshift, + lbann.Constant(value=self.label_to_ignore, hint_layer=xshift)) + length = lbann.Reduction(pad_mask, mode='sum') + length = lbann.Max(length, lbann.Constant(value=1, num_neurons="1")) + recon_loss = lbann.Concatenation(recon_loss) + recon_loss = lbann.Multiply(recon_loss, pad_mask) + recon_loss = lbann.Reduction(recon_loss, mode='sum') + recon_loss = lbann.Divide(recon_loss, length) + + return recon_loss, arg_max diff --git a/applications/ATOM/train_atom_vae.py b/applications/ATOM/train_atom_vae.py new file mode 100644 index 00000000000..61119aebe1e --- /dev/null +++ b/applications/ATOM/train_atom_vae.py @@ -0,0 +1,251 @@ +import argparse +import datetime +import os +import os.path +import sys + +from google.protobuf import text_format as txtf +import json +import numpy as np +import vae as molvae + +import lbann +import lbann.contrib.launcher +import lbann.modules +from lbann.util import str_list + + +def construct_lc_launcher_args(): + + # defaults correspond to the settings needed for training on the moses dataset + parser = argparse.ArgumentParser(prog="lbann ATOM VAE training") + parser.add_argument("--partition", default=None) + parser.add_argument("--account", default="hpcdl") + parser.add_argument("--scheduler", type=str, default="slurm") + parser.add_argument( + "--data-module-file", + default="dataset.py", + help="specifies the module that contains the logic for loading data", + ) + parser.add_argument( + "--data-config", + default=os.path.join( + os.path.abspath(os.path.dirname(__file__)), "zinc_data_config.json" + ), + help="path to a data config file that is used for the construction of python data reader", + ) + parser.add_argument( + "--time-limit", + type=int, + default=720, + help="specified time limit in number of minutes", + ) + parser.add_argument("--nodes", type=int, default=1) + parser.add_argument("--job-name", default="atom_vae") + parser.add_argument("--embedding-dim", type=int, default=None) + parser.add_argument("--num-embeddings", type=int, default=None) + parser.add_argument("--batch-size", type=int, default=512) + parser.add_argument("--num-epochs", type=int, default=20) + parser.add_argument("--data-reader-prototext", default=None) + parser.add_argument("--pad-index", type=int, default=None) + parser.add_argument("--sequence-length", type=int, default=None) + parser.add_argument("--dump_weights_dir", type=str, default="weights") + parser.add_argument("--num-samples", type=int, default=None) + parser.add_argument("--num-io-threads", type=int, default=11) + parser.add_argument("--vocab", default=None) + parser.add_argument("--delimiter", default="c") + parser.add_argument("--no-header", type=bool, default=True) + + # these are specific to the Trainer object + parser.add_argument( + "--procs-per-trainer", + type=int, + default=0, + help="number of processes to use per trainer", + ) + parser.add_argument( + "--lr", + type=float, + default=3e-4, + 
help="optimizer learning rate to use for training", + ) + return parser.parse_args() + +# ============================================== +# Setup and launch experiment +# ============================================== + +def construct_model(run_args): + """Construct LBANN model. + + Initial model for ATOM molecular VAE + + """ + import lbann + + pad_index = run_args.pad_index + assert pad_index is not None + + sequence_length = run_args.sequence_length + assert sequence_length is not None + + print("sequence length is {}".format(sequence_length)) + data_layout = "data_parallel" + # Layer graph + input_ = lbann.Identity(lbann.Input(name='inp'), name='inp1') + vae_loss= [] + input_feature_dims = sequence_length + + embedding_size = run_args.embedding_dim + dictionary_size = run_args.num_embeddings + assert embedding_size is not None + assert dictionary_size is not None + + kl, recon, arg_max = molvae.MolVAE(input_feature_dims, + dictionary_size, + embedding_size, + pad_index)(input_) + + vae_loss.append(kl) + vae_loss.append(recon) + print("LEN vae loss ", len(vae_loss)) + #metric layers + pred_tensor = lbann.Concatenation(arg_max[:-1], name='pred_tensor') + + layers = list(lbann.traverse_layer_graph(input_)) + # Setup objective function + weights = set() + for l in layers: + weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=5e-4) + obj = lbann.ObjectiveFunction(vae_loss) + + # Initialize check metric callback + metrics = [lbann.Metric(kl, name='kl_loss'), + lbann.Metric(recon, name='recon') + ] + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir, epoch_interval=10)] + + + # Construct model + return lbann.Model(run_args.num_epochs, + weights=weights, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + + +def construct_data_reader(run_args): + """ + Construct Protobuf message for Python data reader. + + The Python data reader will import this Python file to access the + sample access functions. 
+ + """ + + module_file = os.path.abspath(run_args.data_module_file) + os.environ["DATA_CONFIG"] = os.path.abspath(run_args.data_config) + + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + print("module_name: {}\tmodule_dir: {}".format(module_name, module_dir)) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = "python" + data_reader.role = "train" + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.validation_percent = 0.1 + data_reader.python.module = module_name + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = "get_sample" + data_reader.python.num_samples_function = "num_samples" + data_reader.python.sample_dims_function = "sample_dims" + + return message + + +def main(): + run_args = construct_lc_launcher_args() + + # add data_config data + # and do not overwrite args if data_reader_prototext is enabled + if os.path.isfile(run_args.data_config) and not run_args.data_reader_prototext: + with open(run_args.data_config, "r") as f: + config = json.load(f) + for k, v in config.items(): + setattr(run_args, k, v) + + trainer = lbann.Trainer( + run_args.batch_size, + name=None, + procs_per_trainer=run_args.procs_per_trainer, + ) + + # define data_reader + if run_args.data_reader_prototext: + print("Using data_reader_prototext") + assert run_args.sequence_length is not None + assert run_args.vocab is not None + + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(run_args.data_reader_prototext, "r") as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader = data_reader_proto.data_reader + else: + data_reader = construct_data_reader(run_args) + + if "LBANN_EXPERIMENT_DIR" in os.environ: + work_dir = os.environ["LBANN_EXPERIMENT_DIR"] + else: + work_dir = os.path.join(os.getcwd()) + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + experiment_dir = os.path.join( + work_dir, "{}_{}".format(timestamp, run_args.job_name) + ) + if not os.path.exists(experiment_dir): + os.makedirs(experiment_dir) + + # model and optimizer + model = construct_model(run_args) + opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8) + + # dump the config to the experiment_dir so that it can be used to load the model in pytorch (moses codebase) + ppn = 4 if run_args.scheduler == "lsf" else 2 + print("args:\n" + str(run_args)) + if(run_args.scheduler == 'slurm'): + import torch + torch.save(run_args, "{}/{}_config.pt".format(experiment_dir, run_args.job_name)) + + status = lbann.contrib.launcher.run( + trainer, + model, + data_reader, + opt, + #partition=run_args.partition, + scheduler=run_args.scheduler, + #account=run_args.account, + time_limit=run_args.time_limit, + nodes=run_args.nodes, + procs_per_node=ppn, + batch_job = True, + job_name=run_args.job_name, + experiment_dir=experiment_dir, + lbann_args=f"--vocab={run_args.vocab} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length} --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}", + ) + + print("LBANN launcher status:\n" + str(status)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/applications/ATOM/zinc10k_data_config.json b/applications/ATOM/zinc10k_data_config.json new file mode 100644 index 00000000000..a5432b1ff14 --- /dev/null +++ b/applications/ATOM/zinc10k_data_config.json 
@@ -0,0 +1,10 @@
+{
+
+    "pad_index": 27,
+    "sequence_length": 56,
+    "max_seq_len": 56,
+    "data_path" : "/p/gscratchr/brainusr/datasets/zinc/moses_zinc_train10K.npy",
+    "embedding_dim": 29,
+    "num_embeddings": 29
+
+}
From 77bb9f9449dfb143a98af6c4f0849dbfc258e457 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 27 Aug 2020 12:40:53 -0700
Subject: [PATCH 11/36] Avoid char in hashing unit tests (#1607)

---
 src/utils/unit_test/hash_test.cpp | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/utils/unit_test/hash_test.cpp b/src/utils/unit_test/hash_test.cpp
index 7de4b81802a..4683853523b 100644
--- a/src/utils/unit_test/hash_test.cpp
+++ b/src/utils/unit_test/hash_test.cpp
@@ -10,7 +10,7 @@ TEST_CASE ("Testing convenience functions for hashing", "[hash][utilities]") {

   SECTION ("hash_combine") {
     std::unordered_set<size_t> hashes;
-    for (size_t seed=0; seed<10; ++seed) {
+    for (size_t seed=0; seed<=16; seed+=2) {
      hashes.insert(seed);
    }
    for (size_t seed=0; seed<=16; seed+=2) {
@@ -27,21 +27,29 @@ TEST_CASE ("Testing convenience functions for hashing", "[hash][utilities]") {
     std::vector<Humor> enum_list = { Humor::MELANCHOLIC, Humor::SANGUINE,
                                      Humor::CHOLERIC, Humor::PHLEGMATIC };
     std::unordered_set<size_t> hashes;
-    for (size_t i=0; i<enum_list.size(); ++i) {
-      const auto hash = lbann::enum_hash<Humor>()(enum_list[i]);
+    for (const auto val : enum_list) {
+      const auto hash = lbann::enum_hash<Humor>()(val);
       CHECK_FALSE(hashes.count(hash));
       hashes.insert(hash);
     }
   }

   SECTION ("pair_hash") {
+    const std::vector<unsigned long> i_list = {1, 2, 1018, 1019,
+                                               11209, 543210, 4294967295};
+    const std::vector<float> j_list = {-12.34f, -8.76f, -4.56f,
+                                       0.f, 4.56f, 8.76f, 12.34f};
     std::unordered_set<size_t> hashes;
-    for (char i=-12; i<=12; i+=3) {
-      for (unsigned long j=0; j<=11209; j+=1019) {
-        std::pair<char,unsigned long> val(i,j);
-        const auto hash = lbann::pair_hash<char,unsigned long>()(val);
-        CHECK_FALSE(hashes.count(hash));
-        hashes.insert(hash);
+    for (const auto i : i_list) {
+      for (const auto j : j_list) {
+        std::pair<unsigned long,float> val1(i,j);
+        const auto hash1 = lbann::pair_hash<unsigned long,float>()(val1);
+        CHECK_FALSE(hashes.count(hash1));
+        hashes.insert(hash1);
+        std::pair<float,unsigned long> val2(j,i);
+        const auto hash2 = lbann::pair_hash<float,unsigned long>()(val2);
+        CHECK_FALSE(hashes.count(hash2));
+        hashes.insert(hash2);
       }
     }
   }
From 9214f698aaaed7d563f9aac61bc58bca14dcb36d Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Fri, 28 Aug 2020 11:13:58 -0700
Subject: [PATCH 12/36] Update FindPython.cmake (#1609)

If `VERSION_MINOR` or `VERSION_PATCH` is `0`, then the
`find_package_handle_standard_args` function will fail (the 0 will be
interpreted as `FALSE`). So just use the whole version in that
function.
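
The failure mode described above can be illustrated outside of CMake. The
following Python snippet is a purely hypothetical sketch (it is not part of
LBANN or of this patch): a check based on the truthiness of each version
component wrongly rejects a legitimate patch level of 0, while a check on the
whole version string does not, which is why the whole version is used instead.

    # Hypothetical Python sketch of the bug class described above: CMake's
    # find_package_handle_standard_args treats the string "0" as FALSE, much
    # like Python truthiness treats the integer 0 as falsy.
    major, minor, patch = 3, 8, 0

    # Per-component check: fails, because patch == 0 looks "unset".
    per_component_ok = all((major, minor, patch))        # False

    # Whole-version check: passes, since "3.8.0" is a non-empty string.
    whole_version_ok = bool(f"{major}.{minor}.{patch}")  # True

    print(per_component_ok, whole_version_ok)            # prints: False True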
--- cmake/modules/FindPython.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/FindPython.cmake b/cmake/modules/FindPython.cmake index 39d5430461e..f30534e61c2 100644 --- a/cmake/modules/FindPython.cmake +++ b/cmake/modules/FindPython.cmake @@ -83,7 +83,7 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args( Python REQUIRED_VARS Python_EXECUTABLE Python_INCLUDE_DIRS Python_LIBRARIES - Python_VERSION_MAJOR Python_VERSION_MINOR Python_VERSION_PATCH + Python_VERSION VERSION_VAR Python_VERSION) # Build the imported target From 451ee5a9a37f22cf9fcf306b5cb709bba854304c Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 28 Aug 2020 15:48:14 -0700 Subject: [PATCH 13/36] Acquire IO RNG objects in synthetic data reader (#1611) --- src/data_readers/data_reader_synthetic.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/data_readers/data_reader_synthetic.cpp b/src/data_readers/data_reader_synthetic.cpp index 7088b9b908b..012db31307a 100644 --- a/src/data_readers/data_reader_synthetic.cpp +++ b/src/data_readers/data_reader_synthetic.cpp @@ -74,6 +74,7 @@ bool data_reader_synthetic::fetch_label(CPUMat& Y, int data_id, int mb_idx) { if (m_num_labels == 0) { LBANN_ERROR("Synthetic data reader does not have labels"); } + auto io_rng = set_io_generators_local_index(0); Y.Set(fast_rand_int(get_fast_io_generator(), m_num_labels), mb_idx, 1); return true; } From ebadc65685ac659e35243bf26bac909d17583316 Mon Sep 17 00:00:00 2001 From: Katie Graham <50850420+graham63@users.noreply.github.com> Date: Mon, 31 Aug 2020 17:05:27 -0700 Subject: [PATCH 14/36] first draft of callback docs (#1555) * first draft of callback docs * Updated name of autoencoder strategy to track_sample_ids * Formatting changes in callback docs * Removed mini_batch_size from model example, removed shell files for callback docs * Fixed issue with code block * Update docs/callbacks.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Update docs/callbacks.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Update docs/callbacks.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Update docs/callbacks/summarize_images.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Update docs/callbacks/summarize_images.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Update docs/callbacks/summarize_images.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Update docs/callbacks/summarize_images.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Update docs/callbacks/summarize_images.rst Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> * Edits to callbacks.rst & summarize_images.rst * Revised docs based on PR suggestions * rework the doc structure slightly Co-authored-by: Tom Benson <30674819+benson31@users.noreply.github.com> Co-authored-by: Thomas R. 
Benson
---
 docs/callbacks.rst                                 | 123 ++++++++++
 .../categorical_accuracy_strategy.rst              |  59 ++++++
 .../track_sample_ids_strategy.rst                  |  41 ++++
 docs/callbacks/summarize_images.rst                | 180 ++++++++++++++++++
 docs/index.rst                                     |   1 +
 docs/running_lbann.rst                             |   4 +-
 6 files changed, 407 insertions(+), 1 deletion(-)
 create mode 100644 docs/callbacks.rst
 create mode 100644 docs/callbacks/selection_strategy/categorical_accuracy_strategy.rst
 create mode 100644 docs/callbacks/selection_strategy/track_sample_ids_strategy.rst
 create mode 100644 docs/callbacks/summarize_images.rst

diff --git a/docs/callbacks.rst b/docs/callbacks.rst
new file mode 100644
index 00000000000..bfb490ebc2b
--- /dev/null
+++ b/docs/callbacks.rst
@@ -0,0 +1,123 @@
+.. role:: python(code)
+   :language: python
+
+.. _callbacks:
+
+============================================================
+Callbacks
+============================================================
+
+LBANN has numerous callbacks that can be used to collect
+data about an experiment, such as scalars, metrics, weights,
+memory usage, images, etc. The documentation of many of these
+is pending; see the :ref:`list of Available
+Callbacks <available-callbacks>` for the documented ones.
+
+The callbacks are set to execute at various times, and can be
+used to display images according to either a boolean output or
+their global sample index.
+
+For a complete listing of callbacks and details about their
+functionality, please see :ref:`Available
+Callbacks <available-callbacks>`.
+
+.. _using-callbacks:
+
+------------------------------------------------
+Using Callbacks
+------------------------------------------------
+
+Callbacks are used by adding them to the Python front end with the
+appropriate arguments and passing them as a list into the model.
+For example, the timer, print_statistics, and save_model
+callbacks could be included with the following:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Python Front End
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    timer = lbann.CallbackTimer()
+    print_stats = lbann.CallbackPrintStatistics(
+        batch_interval=5)
+    save_model = lbann.CallbackSaveModel(
+        dir=".",
+        disable_save_after_training=True)
+
+    callbacks = [timer,
+                 print_stats,
+                 save_model]
+
+    model = lbann.Model(num_epochs,
+                        layers,
+                        objective_function,
+                        metrics,
+                        callbacks)
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Protobuf (Advanced)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block::
+
+    callback {
+      timer {
+      }
+      print_statistics {
+        batch_interval: 5
+      }
+      save_model {
+        dir: "."
+        disable_save_after_training: true
+      }
+    }
+
+.. _available-callbacks:
+
+------------------------------------------------
+Available Callbacks
+------------------------------------------------
+
+..
toctree:: + :maxdepth: 1 + + Check dataset + Check gradients + Check init + Check metric + Check nan in activation values + Check matrices in small values + Checkpoint + Confusion matrix + Debug + Debug io + Dump error signals + Dump gradients + Dump minibatch sample indices + Dump outputs + Dump weights + Early stopping + Gpu memory usage + Hang + Imcomm + Learning rate + Load model + Ltfb + Mixup + Monitor io + Perturb adam + Perturb dropout + Print model description + Print statistics + Profiler + Replace weights + Save images + Save model + Save topk models + Summarize images + Summary + Sync layers + Timeline + Timer + Variable minibatch diff --git a/docs/callbacks/selection_strategy/categorical_accuracy_strategy.rst b/docs/callbacks/selection_strategy/categorical_accuracy_strategy.rst new file mode 100644 index 00000000000..f575997d991 --- /dev/null +++ b/docs/callbacks/selection_strategy/categorical_accuracy_strategy.rst @@ -0,0 +1,59 @@ +.. role:: python(code) + :language: python + +============================== +Categorical Accuracy Strategy +============================== + +---------- +Summary +---------- + +The :python:`CategoricalAccuracyStrategy` is used to view a snapshot +of images in the dataset being used in the training session that match +a boolean criterion. To simplify things in the model construction, +this strategy can print images whose output is :python:`true`, images +whose output is :python:`false`, or all images. A canonical use-case +is to print the images that are (in)correctly categorized by a +classification model. The number of images output is limited by a +user-provided parameter or until no more matches are found. + +.. note:: The name of this class erroneously suggests a rather narrow + use-case. We are looking to change the name in a future + release of LBANN. In fact, this strategy can take as input + any boolean layer, not just categorical accuracy layers. + +---------- +Arguments +---------- + ++ :python:`categorical_accuracy_layer_name` (string): The name of the + boolean layer to be used to determine matches. A Python Front-End + layer's name can be accessed via the :python:`name` attribute. A + common use-case is the name of a :python:`CategoricalAccuracy` layer + that has been added to a model. + ++ :python:`match_type` + (:python:`lbann.CategoricalAccuracyStrategy.MatchType`): Criterion for + selecting images to output. Possible values are: + + ================= ======================================================= + :python:`NOMATCH` Output images corresponding to :python:`false` values. + :python:`MATCH` Output images corresponding to :python:`true` values. + :python:`ALL` Output all images. + ================= ======================================================= + + The default value is :python:`NOMATCH`. + ++ :python:`num_images_per_epoch` (uint): The maximum number of images to + output per epoch. The default value is 10. + +---------- +Usage +---------- + +See the :ref:`usage example` as part of +the :doc:`CallbackSummarizeImages ` +documentation. + + diff --git a/docs/callbacks/selection_strategy/track_sample_ids_strategy.rst b/docs/callbacks/selection_strategy/track_sample_ids_strategy.rst new file mode 100644 index 00000000000..252c16fc379 --- /dev/null +++ b/docs/callbacks/selection_strategy/track_sample_ids_strategy.rst @@ -0,0 +1,41 @@ +.. 
role:: python(code) + :language: python + +============================== +Track Sample IDs Strategy +============================== + +---------- +Summary +---------- + +The :python:`TrackSampleIDsStrategy` selection strategy is used by +:python:`CallbackSummarizeImages` to output a constant set of images +over the duration of a training run of LBANN. Use of this strategy is +ideally suited to generative applications, as it allows users to +visualize the ability of a network to reproduce the same image over +time. + +---------- +Arguments +---------- + ++ :python:`input_layer_name`: the name of the input layer with the + original images. For reasons inherent to the C++ code, this must be + an :python:`Input` layer. A Python Front-End layer's name can be + accessed via the :python:`name` attribute. + ++ :python:`num_tracked_images`: the number of images to track. If + unset, 10 images will be tracked. This is a proxy for the user + specifying images to track based on some unique identifier. We are + considering methods to expose this functionality; this is work in + progress. + +---------- +Usage +---------- + +See the :ref:`usage example` as part of +the :doc:`CallbackSummarizeImages ` +documentation. + diff --git a/docs/callbacks/summarize_images.rst b/docs/callbacks/summarize_images.rst new file mode 100644 index 00000000000..5b473175253 --- /dev/null +++ b/docs/callbacks/summarize_images.rst @@ -0,0 +1,180 @@ +.. role:: python(code) + :language: python + +.. role:: c(code) + :language: c + +.. _summarize-images-callback: + +============================================================ +Summarize Images Callback +============================================================ + +The purpose of this callback is to output images into an event file at +the end of each epoch, according to the specified intervals. The +images in the event file are displayed using `Tensorboard +`_. This callback could be +used, for example, to display categorized images or images generated +by an autoencoder or by a GAN. + +The method of selecting images, and the layers from which images are +displayed, can be controlled via the :python:`selection_strategy` +argument to the callback. Images that match some boolean value may be +selected with :python:`CategoricalAccuracyStrategy`. A canonical +example of this would be to output images that are classified +incorrectly by a classification network. Alternatively, a fixed number +of images can be displayed using +:python:`TrackSampleIDsStrategy`. This may be used, for example, to +visualize the progress in training a GAN or an autoencoder. + +--------------------------------------------- +Execution Points +--------------------------------------------- + ++ After each testing/validation minibatch + +--------------------------------------------- +Callback Arguments (Python Front-End) +--------------------------------------------- + ++ :python:`selection_strategy`: The image selection + strategy. Currently supported options are: + + - :doc:`TrackSampleIDsStrategy ` + - :doc:`CategoricalAccuracyStrategy ` + ++ :python:`image_source_layer_name`: The name of the layer from which + images will be pulled. A Python Front-End layer's name can be + accessed via the :python:`name` attribute. This may be the input + layer, if the true image is requested, or it may be any layer that + outputs a valid image tensor. This means it must be either + a 2-D tensor (greyscale image) or a 3-D tensor with the channel + dimension equal to 1 or 3 (greyscale or RGB, respectively). 
+
++ :python:`epoch_interval`: Epoch frequency to output images. The
+  default value is 1; that is, perform the output every epoch.
+
+
+---------------------------------------------
+Examples Using Summarize Images Callback
+---------------------------------------------
+
+Python Front-End
+--------------------
+
+.. _sample_id_strategy_example:
+
+Track Sample IDs Strategy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note:: There is currently no built-in way to print the
+          original images using a single callback instance. As a
+          work-around, if the original image is desired, add a
+          second instance of the :python:`CallbackSummarizeImages`
+          with the :python:`image_source_layer_name` field set to
+          the input layer's name and the :python:`epoch_interval`
+          field set to be larger than the total number of epochs you
+          expect to run (so it will only output from epoch 0 and
+          never again).
+
+.. code-block:: python
+
+    # Set up image selection strategy
+    img_strategy = lbann.TrackSampleIDsStrategy(
+        input_layer_name="input",
+        num_tracked_images=10)
+
+    # Pass parameters to callback
+    summarize_images = lbann.CallbackSummarizeImages(
+        selection_strategy=img_strategy,
+        image_source_layer_name="reconstruction",
+        epoch_interval=5)
+
+    # Optional: output the original image from the input layer once,
+    # using a high epoch interval
+    summarize_input_layer = lbann.CallbackSummarizeImages(
+        selection_strategy=img_strategy,
+        image_source_layer_name="input",
+        epoch_interval=10000)
+
+.. _cat_acc_strategy_example:
+
+Categorical Accuracy Strategy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # Set up categorical accuracy layer
+    accuracy = lbann.CategoricalAccuracy(prediction_scores, labels)
+
+    # Set up image selection criteria
+    match_type = lbann.CategoricalAccuracyStrategy.MatchType
+
+    # Set up image selection strategy
+    img_strategy = lbann.CategoricalAccuracyStrategy(
+        cat_accuracy_layer_name=accuracy.name,
+        match_type=match_type.NOMATCH,
+        num_images=10)
+
+    # Pass parameters to callback
+    summarize_images = lbann.CallbackSummarizeImages(
+        selection_strategy=img_strategy,
+        image_source_layer_name=images.name,
+        epoch_interval=5)
+
+
+Prototext (Advanced)
+----------------------
+
+Track Sample IDs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: guess
+
+    callback {
+      summarize_images {
+        selection_strategy {
+          track_sample_ids {
+            input_layer_name: "input"
+            num_tracked_images: 10
+          }
+          image_source_layer_name: "reconstruction"
+          epoch_interval: 1
+        }
+      }
+    }
+
+
+Categorical Accuracy Strategy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: guess
+
+    # Set up categorical accuracy layer
+    layer {
+      parents: "prob label"
+      name: "accuracy"
+      data_layout: "data_parallel"
+      categorical_accuracy {}
+    }
+
+    # Set up callback
+    callback {
+      summarize_images {
+        selection_strategy {
+          categorical_accuracy {
+            cat_accuracy_layer_name: "accuracy"
+            num_images: 10
+          }
+          image_source_layer_name: "images"
+          epoch_interval: 1
+          img_format: ".jpg"
+        }
+      }
+    }
+
+.. toctree::
+   :hidden:
+
+   selection_strategy/categorical_accuracy_strategy.rst
+   selection_strategy/track_sample_ids_strategy.rst
diff --git a/docs/index.rst b/docs/index.rst
index e4603712d64..bb56b6ccf56 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -31,6 +31,7 @@ Users are advised to view `the Doxygen API Documentation
    building_lbann
    running_lbann
+   callbacks

 .. toctree::
    :maxdepth: 1
diff --git a/docs/running_lbann.rst b/docs/running_lbann.rst
index 124d9d28cbf..5223d55cad2 100644
--- a/docs/running_lbann.rst
+++ b/docs/running_lbann.rst
@@ -148,7 +148,9 @@ Model components
 + Callback: Function that is performed at various points during an
   experiment. Callbacks are helpful for reporting, debugging, and
-  performing advanced training techniques.
+  performing advanced training techniques. Please consult the
+  :ref:`Callbacks <callbacks>` documentation for detailed
+  descriptions of the callbacks.

   - This is the natural home for experimental training techniques.
From 86d9c72f5078369102a137ff4459bb71ab5a08b4 Mon Sep 17 00:00:00 2001
From: Brian Van Essen
Date: Mon, 31 Aug 2020 23:07:55 -0700
Subject: [PATCH 15/36] Update spack environment format (#1608)

* Updating the spack environments to be compatible with Spack v0.15.4+.

* Updated the standard packages in install script to be compatible with
  Spack v0.15.4+ packages.yaml and environment format.

* Bumped the minimum spack version

* Require C++14 for CUB compliance, etc.

* Remove whitespaces in spack environments that cause errors in hash
  generation

* Removed problematic whitespace.
---
 CMakeLists.txt                                     |   6 +-
 scripts/build_lbann_from_source.sh                 |   2 +-
 scripts/install_lbann.sh                           |  17 ++-
 spack_environments/llnl_lc/compilers.sh            |   9 +-
 .../externals-linux-rhel7-broadwell.sh             |  91 ++++++++-------
 .../llnl_lc/externals-linux-rhel7-haswell.sh       |  91 ++++++++-------
 .../externals-linux-rhel7-ivybridge.sh             |  91 ++++++++-------
 .../llnl_lc/externals-linux-rhel7-power8le.sh      |  89 ++++++++------
 .../llnl_lc/externals-linux-rhel7-power9le.sh      |  85 +++++++------
 spack_environments/nersc/compilers.sh              |   3 +-
 .../externals-cray-cnl7-skylake_avx512.sh          | 104 ++++++++++--------
 spack_environments/osx/compilers.sh                |  10 +-
 .../osx/externals-darwin-mojave-skylake.sh         |  53 +++++----
 .../std_versions_and_variants.sh                   |  43 ++++----
 14 files changed, 394 insertions(+), 300 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65a5e3efbf8..4ed8c4b5250 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,9 +26,9 @@ if (NOT DEFINED BUILD_SHARED_LIBS)
   set(BUILD_SHARED_LIBS ON)
 endif ()

-# Build with at least C++11 standard; allow newer standards.
+# Build with at least C++14 standard; allow newer standards.
if (NOT CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD EQUAL 98) - set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) endif () @@ -264,7 +264,7 @@ if (LBANN_HAS_CUDA) enable_language(CUDA) if (NOT CMAKE_CUDA_STANDARD OR CMAKE_CUDA_STANDARD EQUAL 98) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD 14) endif () set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) diff --git a/scripts/build_lbann_from_source.sh b/scripts/build_lbann_from_source.sh index 3320cbcf355..14e9d0e6c66 100755 --- a/scripts/build_lbann_from_source.sh +++ b/scripts/build_lbann_from_source.sh @@ -7,7 +7,7 @@ if [ -n "${SPACK_ROOT}" ]; then fi SPACK_VERSION=$(spack --version | sed 's/-.*//g') -MIN_SPACK_VERSION=0.13.3 +MIN_SPACK_VERSION=0.15.4 source $(dirname ${BASH_SOURCE})/utilities.sh diff --git a/scripts/install_lbann.sh b/scripts/install_lbann.sh index c49460aeaef..9659661903b 100755 --- a/scripts/install_lbann.sh +++ b/scripts/install_lbann.sh @@ -7,7 +7,7 @@ if [ -n "${SPACK_ROOT}" ]; then fi SPACK_VERSION=$(spack --version | sed 's/-.*//g') -MIN_SPACK_VERSION=0.13.3 +MIN_SPACK_VERSION=0.15.4 source $(dirname ${BASH_SOURCE})/utilities.sh @@ -258,29 +258,26 @@ ${STD_PACKAGES} aluminum: buildable: true - version: [0.4.0] + version: + - 0.4.0 ${AL_VARIANTS} providers: {} - paths: {} - modules: {} compiler: [] target: [] hydrogen: buildable: true - version: [1.4.0] + version: + - 1.4.0 ${HYDROGEN_VARIANTS} providers: {} - paths: {} - modules: {} compiler: [] target: [] dihydrogen: buildable: true - version: [master] + version: + - master ${DIHYDROGEN_VARIANTS} providers: {} - paths: {} - modules: {} compiler: [] target: [] diff --git a/spack_environments/llnl_lc/compilers.sh b/spack_environments/llnl_lc/compilers.sh index 30c6e25f828..a2a5c06baaf 100644 --- a/spack_environments/llnl_lc/compilers.sh +++ b/spack_environments/llnl_lc/compilers.sh @@ -1,14 +1,19 @@ #!/bin/sh COMPILER_ALL_PACKAGES=$(cat < Date: Tue, 1 Sep 2020 12:01:27 -0700 Subject: [PATCH 16/36] Support large numbers of channels in GPU batchnorm layer (#1610) --- .../test_unit_layer_batch_normalization.py | 179 +++++++ .../regularizers/batch_normalization.cu | 454 ++++++++++-------- 2 files changed, 433 insertions(+), 200 deletions(-) create mode 100644 bamboo/unit_tests/test_unit_layer_batch_normalization.py diff --git a/bamboo/unit_tests/test_unit_layer_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_batch_normalization.py new file mode 100644 index 00000000000..cbc3de3230d --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_batch_normalization.py @@ -0,0 +1,179 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +np.random.seed(20200827) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: We want to use gradient checking to verify that error + # signals are correct. To do this, we zero-initialize a weights + # object, construct a zero-valued tensor, and add it to the + # input. To make sure that batchnorm is non-trivial, we multiply + # the zero-valued tensor by the mini-batch index. + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Divide(lbann.MiniBatchIndex(), lbann.MiniBatchSize()) + x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x, lbann.Multiply(x0, x1)) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Local statistics + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.BatchNormalization(x, + decay=decay, + epsilon=epsilon, + scale_init=1.5, + bias_init=0.25, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='local statistics')) + + # ------------------------------------------ + # Global statistics + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.BatchNormalization(x, + decay=decay, + epsilon=epsilon, + scale_init=0.8, + bias_init=-0.25, + statistics_group_size=-1, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='global statistics')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 1 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index 4f6a44e5a30..2683e8458f1 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -33,66 +33,89 @@ namespace lbann { namespace { -/** CUDA kernel to compute channel sums. - * Sums and squares of sums are used to compute mean and variance. +/** Functor for adding arrays. */ +template +struct array_sum +{ + using ArrayType = cuda::array; + __device__ __forceinline__ + ArrayType operator()(const ArrayType& x, const ArrayType& y) + { + ArrayType sum; +#pragma unroll + for (size_t i = 0; i < N; ++i) { + sum[i] = x[i] + y[i]; + } + return sum; + } +}; + +/** Accumulate sums and sums of squares for each channel. + * + * On input, sums and sqsums are assumed to be filled with zeros. + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (channel_size / bsize) x num_channels x 1 */ -template -__global__ void channel_sums_kernel( - El::Int channel_height, - El::Int width, - const TensorDataType * __restrict__ data, El::Int data_ldim, +template +__global__ void fp_sums_kernel( + int mini_batch_size, + int num_channels, + int channel_size, + const TensorDataType * __restrict__ data, int data_ldim, TensorDataType * __restrict__ sums, TensorDataType * __restrict__ sqsums) { - // Indices - const El::Int tid = threadIdx.x; - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Initialize shared memory - __shared__ TensorDataType shared_sums[block_size]; - __shared__ TensorDataType shared_sqsums[block_size]; - - // Compute row sums in shared memory - TensorDataType private_sum = 0; - TensorDataType private_sqsum = 0; - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for (El::Int col = 0; col < width; ++col) { - const auto& x = data[row + col * data_ldim]; - private_sum += x; - private_sqsum += x * x; + // Indices and dimensions + constexpr int bdimy = 1; + constexpr int bdimz = 1; + const auto& tid = threadIdx.x; + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = blockIdx.y; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = gridDim.y; + + for (int channel = gidy; channel < num_channels; channel += nthreadsy) { + + // Accumulate sums and perform block-wide reduction + using array_t = cuda::array; + using array_sum_t = array_sum; + array_t sum_sqsum; + sum_sqsum[0] = TensorDataType(0); + sum_sqsum[1] = TensorDataType(0); + for (int i = gidx; i < channel_size; i += nthreadsx) { + for (int j = 0; j < mini_batch_size; ++j) { + const auto& x = data[i + channel*channel_size + 
j*data_ldim]; + sum_sqsum[0] += x; + sum_sqsum[1] += x * x; + } } - } - shared_sums[tid] = private_sum; - shared_sqsums[tid] = private_sqsum; - - // Compute channel sum with shared memory reduction - /// @todo unroll loops - for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { - __syncthreads(); - if(tid < stride) { - shared_sums[tid] += shared_sums[tid + stride]; - shared_sqsums[tid] += shared_sqsums[tid + stride]; + sum_sqsum = cuda::block_reduce(sum_sqsum); + + // Output result to global memory + if (tid == 0) { + cuda::atomic_add(&sums[channel], sum_sqsum[0]); + cuda::atomic_add(&sqsums[channel], sum_sqsum[1]); } - } - // Output channel sum to global memory - if (tid == 0) { - cuda::atomic_add(&sums[bidy], shared_sums[0]); - cuda::atomic_add(&sqsums[bidy], shared_sqsums[0]); } } -/** CUDA kernel to compute statistics. +/** Compute statistics for each channel. + * * On input, global_mean and global_var are assumed to contain sums * and squares of sums, respectively. + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (num_channels / bsize) x 1 x 1 */ template -__global__ void compute_statistics_kernel( - El::Int num_sums, - El::Int num_per_sum, +__global__ void fp_statistics_kernel( + int num_sums, + int num_per_sum, TensorDataType epsilon, TensorDataType decay, TensorDataType * __restrict__ global_mean, @@ -100,9 +123,9 @@ __global__ void compute_statistics_kernel( TensorDataType * __restrict__ global_running_mean, TensorDataType * __restrict__ global_running_var) { - const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int num_threads = blockDim.x * gridDim.x; - for (El::Int i = gid; i < num_sums; i += num_threads) { + const auto& gid = threadIdx.x + blockIdx.x * blockDim.x; + const auto& num_threads = blockDim.x * gridDim.x; + for (auto i = gid; i < num_sums; i += num_threads) { TensorDataType num_per_sum_dt = TensorDataType(num_per_sum); // Compute mean and variance @@ -123,54 +146,79 @@ __global__ void compute_statistics_kernel( } -/** CUDA kernel to apply batch normalization. */ -template -__global__ void batch_normalization_kernel( - El::Int channel_height, - El::Int width, - const TensorDataType * __restrict__ global_input, El::Int input_ldim, +/** Compute outputs. 
+ * + * y_i = (x_i - mean) / sqrt(var + epsilon) + * + * Block dimensions: bdimx x bdimy x bdimz + * + * Grid dimensions: (channel_size / bdimx) x (mini_batch_size / bdimy) x (num_channels / bdimz) + * + */ +template +__global__ void fp_output_kernel( + int mini_batch_size, + int num_channels, + int channel_size, + const TensorDataType * __restrict__ global_input, int input_ldim, const TensorDataType * __restrict__ global_mean, const TensorDataType * __restrict__ global_var, TensorDataType epsilon, const TensorDataType * __restrict__ global_scale, const TensorDataType * __restrict__ global_bias, - TensorDataType * __restrict__ global_output, El::Int output_ldim) { - - // Indices - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Copy batch normalization parameters to private memory - const auto& mean = global_mean[bidy]; - const auto& var = global_var[bidy]; - const auto& scale = global_scale[bidy]; - const auto& bias = global_bias[bidy]; - - // Get reciprocal of standard deviation - const auto& inv_stdev = cuda::rsqrt(var + epsilon); - - // Apply batch normalization - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for (El::Int col = 0; col < width; ++col) { - const auto& x = global_input[row + col * input_ldim]; - const auto& xhat = (x - mean) * inv_stdev; - const auto& y = scale * xhat + bias; - global_output[row + col * output_ldim] = y; + TensorDataType * __restrict__ global_output, int output_ldim) { + + // Indices and dimensions + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = threadIdx.y + blockIdx.y * blockDim.y; + const auto& gidz = threadIdx.z + blockIdx.z * blockDim.z; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = blockDim.y * gridDim.y; + const auto& nthreadsz = blockDim.z * gridDim.z; + + for (auto k = gidz; k < num_channels; k += nthreadsz) { + const auto& mean = global_mean[k]; + const auto& var = global_var[k]; + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& scale = global_scale[k]; + const auto& bias = global_bias[k]; + for (auto j = gidy; j < mini_batch_size; j += nthreadsy) { + for (auto i = gidx; i < channel_size; i += nthreadsx) { + const auto& x = global_input[i + k*channel_size + j*input_ldim]; + const auto& xhat = (x - mean) * inv_stdev; + const auto& y = scale * xhat + bias; + global_output[i + k*channel_size + j*output_ldim] = y; + } } } } -/** CUDA kernel to compute gradients w.r.t. batch norm parameters. */ -template -__global__ void backprop1_kernel( - El::Int channel_height, - El::Int width, +/** Compute gradients w.r.t. statistics and affine transform. + * + * dL/dscale = sum(dL/dy_i * xhat_i) + * + * dL/dbias = sum(dL/dy_i) + * + * dL/dmean = - sum(dL/dy_i) / sqrt(var+epsilon) + * + * dL/dvar = - sum(dL/dy_i * (x_i-mean)) * (var+epsilon)^(-3/2) / 2 + * + * On input, means_grad and vars_grad are filled with zeros. 
+ * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (channel_size / bsize) x num_channels x 1 */ +template <int block_size, typename TensorDataType> +__global__ void bp_statistics_grad_kernel( + int mini_batch_size, + int num_channels, + int channel_size, const TensorDataType * __restrict__ global_input, - El::Int input_ldim, + int input_ldim, const TensorDataType * __restrict__ global_gradient_wrt_output, - El::Int gradient_wrt_output_ldim, + int gradient_wrt_output_ldim, const TensorDataType * __restrict__ global_mean, const TensorDataType * __restrict__ global_var, TensorDataType epsilon, @@ -180,82 +228,82 @@ __global__ void backprop1_kernel( TensorDataType * __restrict__ global_dmean, TensorDataType * __restrict__ global_dvar) { - // Indices - const El::Int tid = threadIdx.x; - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Initialize shared memory - __shared__ TensorDataType shared_dscale[block_size]; - __shared__ TensorDataType shared_dbias[block_size]; - __shared__ TensorDataType shared_dmean[block_size]; - __shared__ TensorDataType shared_dvar[block_size]; - - // Copy batch normalization parameters to private memory - const auto& mean = global_mean[bidy]; - const auto& var = global_var[bidy]; - const auto& scale = global_scale[bidy]; - - // Compute useful constants - const TensorDataType zero = TensorDataType(0); - const auto& inv_stdev = cuda::rsqrt(var + epsilon); - const auto& dvar_factor = inv_stdev * inv_stdev * inv_stdev / TensorDataType(2); - - // Compute row-wise gradient contributions in shared memory - auto dscale = zero; - auto dbias = zero; - auto dmean = zero; - auto dvar = zero; - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for(El::Int col = 0; col < width; ++col) { - const auto& x = global_input[row + col * input_ldim]; - const auto& xhat = (x - mean) * inv_stdev; - const auto& dy = global_gradient_wrt_output[row + col * gradient_wrt_output_ldim]; - dscale += dy * xhat; - dbias += dy; - const auto& dxhat = dy * scale; - dmean += - dxhat * inv_stdev; - dvar += - dxhat * (x - mean) * dvar_factor; + // Indices and dimensions + constexpr int bdimy = 1; + constexpr int bdimz = 1; + const auto& tid = threadIdx.x; + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = blockIdx.y; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = gridDim.y; + + for (int channel = gidy; channel < num_channels; channel += nthreadsy) { + + // Copy batch normalization parameters to private memory + const auto& mean = global_mean[channel]; + const auto& var = global_var[channel]; + const auto& scale = global_scale[channel]; + + // Compute useful constants + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& dvar_factor = inv_stdev * inv_stdev * inv_stdev * TensorDataType(0.5); + + // Accumulate sums and perform block-wide reduction + using array_t = cuda::array<TensorDataType, 4>; + using array_sum_t = array_sum; + array_t sums; + sums[0] = TensorDataType(0); + sums[1] = TensorDataType(0); + sums[2] = TensorDataType(0); + sums[3] = TensorDataType(0); + for (int i = gidx; i < channel_size; i += nthreadsx) { + for (int j = 0; j < mini_batch_size; ++j) { + const auto& x = global_input[i + channel*channel_size + j*input_ldim]; + const auto& xhat = (x - mean) * inv_stdev; + const auto& dy = global_gradient_wrt_output[i + + channel*channel_size + + j*gradient_wrt_output_ldim]; + sums[0] += dy * xhat; + sums[1] += dy; + const auto& dxhat = dy * scale; + sums[2] -= dxhat * inv_stdev; +
sums[3] -= dxhat * (x - mean) * dvar_factor; + } } - } - shared_dscale[tid] = dscale; - shared_dbias[tid] = dbias; - shared_dmean[tid] = dmean; - shared_dvar[tid] = dvar; - - // Compute gradients with shared memory reduction - // @todo unroll loops - for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { - __syncthreads(); - if (tid < stride) { - shared_dscale[tid] += shared_dscale[tid + stride]; - shared_dbias[tid] += shared_dbias[tid + stride]; - shared_dmean[tid] += shared_dmean[tid + stride]; - shared_dvar[tid] += shared_dvar[tid + stride]; + sums = cuda::block_reduce(sums); + + // Output result to global memory + if (tid == 0) { + cuda::atomic_add(&global_dscale[channel], sums[0]); + cuda::atomic_add(&global_dbias[channel], sums[1]); + cuda::atomic_add(&global_dmean[channel], sums[2]); + cuda::atomic_add(&global_dvar[channel], sums[3]); } - } - // Output channel sum to global memory - if (tid == 0) { - cuda::atomic_add(&global_dscale[bidy], shared_dscale[0]); - cuda::atomic_add(&global_dbias[bidy], shared_dbias[0]); - cuda::atomic_add(&global_dmean[bidy], shared_dmean[0]); - cuda::atomic_add(&global_dvar[bidy], shared_dvar[0]); - } } } -/** CUDA kernel to compute gradients w.r.t. input. */ -template <typename TensorDataType> -__global__ void backprop2_kernel( - El::Int channel_height, - El::Int local_width, - El::Int num_per_sum, +/** Compute gradients w.r.t. input. + * + * dL/dx_i = ( dL/dxhat_i / sqrt(var+epsilon) + * + dL/dmean / n + * + dL/dvar * (x_i - mean) * 2/(n-1) ) + * + * Block dimensions: bdimx x bdimy x bdimz + * + * Grid dimensions: (channel_size / bdimx) x (mini_batch_size / bdimy) x (num_channels / bdimz) + */ +template <typename TensorDataType> +__global__ void bp_input_grad_kernel( + int mini_batch_size, + int num_channels, + int channel_size, + int num_per_sum, const TensorDataType * __restrict__ global_input, - El::Int input_ldim, + int input_ldim, const TensorDataType * __restrict__ global_gradient_wrt_output, - El::Int gradient_wrt_output_ldim, + int gradient_wrt_output_ldim, const TensorDataType * __restrict__ global_mean, const TensorDataType * __restrict__ global_var, TensorDataType epsilon, @@ -263,33 +311,33 @@ __global__ void backprop2_kernel( const TensorDataType * __restrict__ global_dmean, const TensorDataType * __restrict__ global_dvar, TensorDataType * __restrict__ global_gradient_wrt_input, - El::Int gradient_wrt_input_ldim) { - - // Indices - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Copy batch normalization parameters to private memory - const auto& mean = global_mean[bidy]; - const auto& var = global_var[bidy]; - const auto& scale = global_scale[bidy]; - const auto& dmean = global_dmean[bidy]; - const auto& dvar = global_dvar[bidy]; - - // Compute useful constants - const auto& inv_stdev = cuda::rsqrt(var + epsilon); - const auto& dmean_term = dmean / TensorDataType(num_per_sum); - const auto& dvar_term = dvar * TensorDataType(2) / TensorDataType(num_per_sum - 1); - - // Apply batch normalization - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for (El::Int col = 0; col < local_width; ++col) { - const auto& x = global_input[row + col * input_ldim]; - const auto& dy = global_gradient_wrt_output[row + col * gradient_wrt_output_ldim]; - const auto& dxhat = dy * scale; - auto& dx = global_gradient_wrt_input[row + col * gradient_wrt_input_ldim]; - dx = dxhat * inv_stdev + dmean_term + dvar_term * (x - mean); + int gradient_wrt_input_ldim) { + + // Indices and dimensions + const auto& gidx = threadIdx.x +
blockIdx.x * blockDim.x; + const auto& gidy = threadIdx.y + blockIdx.y * blockDim.y; + const auto& gidz = threadIdx.z + blockIdx.z * blockDim.z; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = blockDim.y * gridDim.y; + const auto& nthreadsz = blockDim.z * gridDim.z; + + for (auto k = gidz; k < num_channels; k += nthreadsz) { + const auto& mean = global_mean[k]; + const auto& var = global_var[k]; + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& scale = global_scale[k]; + const auto& dmean = global_dmean[k]; + const auto& dvar = global_dvar[k]; + const auto& dmean_term = dmean / TensorDataType(num_per_sum); + const auto& dvar_term = dvar * TensorDataType(2) / TensorDataType(num_per_sum - 1); + for (auto j = gidy; j < mini_batch_size; j += nthreadsy) { + for (auto i = gidx; i < channel_size; i += nthreadsx) { + const auto& x = global_input[i + k*channel_size + j*input_ldim]; + const auto& dy = global_gradient_wrt_output[i + k*channel_size + j*gradient_wrt_output_ldim]; + const auto& dxhat = dy * scale; + auto& dx = global_gradient_wrt_input[i + k*channel_size + j*gradient_wrt_input_ldim]; + dx = dxhat * inv_stdev + dmean_term + dvar_term * (x - mean); + } } } @@ -436,18 +484,21 @@ void batch_normalization_layer::fp_compute() { El::Zero(local_mean); El::Zero(local_var); if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - channel_sums_kernel + grid_dims.y = El::Min(num_channels, 65535); + fp_sums_kernel <<>>( - channel_size, local_width, + local_width, + num_channels, + channel_size, local_input.LockedBuffer(), local_input.LDim(), - local_mean.Buffer(), local_var.Buffer()); + local_mean.Buffer(), + local_var.Buffer()); } - El::Int num_per_sum; + int num_per_sum; if (this->m_statistics_group_size == 0) { // Global statistics aggregation; allreduce on fused buffer. 
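As a reference for this step: the fused mean-and-variance buffer packs each channel's sum s and sum of squares q over n = num_per_sum entries, so the single allreduce below aggregates both at once, and fp_statistics_kernel then recovers the statistics. Restating in LaTeX the arithmetic implied by the surrounding code, the num_per_sum <= 1 guard, and the 2/(n-1) factor in the backward pass (a reconstruction, not quoted from the kernel body):

    \mu = \frac{s}{n}, \qquad \sigma^2 = \frac{n}{n-1}\left(\frac{q}{n} - \mu^2\right) \quad (n > 1),

    \text{running mean} \leftarrow \text{decay}\cdot\text{running mean} + (1-\text{decay})\,\mu, \qquad \text{running var} \leftarrow \text{decay}\cdot\text{running var} + (1-\text{decay})\,\sigma^2.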
this->m_comm->allreduce(*this->m_mean_and_var, this->m_mean_and_var->RedundantComm(), @@ -475,9 +526,10 @@ void batch_normalization_layer::fp_compute() { if (num_per_sum <= 1) { El::Fill(local_var, TensorDataType(1.0)); } else if (num_channels > 0) { - const El::Int block_dim = 256; - const El::Int grid_dim = (num_channels + block_dim - 1) / block_dim; - compute_statistics_kernel<<>>( + constexpr size_t block_dim = 256; + const size_t grid_dim = El::Min((num_channels + block_dim - 1) / block_dim, + 65535); + fp_statistics_kernel<<>>( num_channels, num_per_sum, this->m_epsilon, this->m_decay, local_mean.Buffer(), local_var.Buffer(), local_running_mean.Buffer(), local_running_var.Buffer()); @@ -495,14 +547,15 @@ void batch_normalization_layer::fp_compute() { this->m_var_v->LockedMatrix() : this->weights_values(3).LockedMatrix()); if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - batch_normalization_kernel + grid_dims.y = El::Min(local_width, 65535); + grid_dims.z = El::Min(num_channels, 65535); + fp_output_kernel <<>>( - channel_size, local_width, + local_width, num_channels, channel_size, local_input.LockedBuffer(), local_input.LDim(), local_mean.LockedBuffer(), local_var.LockedBuffer(), this->m_epsilon, local_scale.LockedBuffer(), local_bias.LockedBuffer(), @@ -557,14 +610,14 @@ void batch_normalization_layer::bp_compute() { El::Zero(local_mean_gradient); El::Zero(local_var_gradient); if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - backprop1_kernel + grid_dims.y = El::Min(num_channels, 65535); + bp_statistics_grad_kernel <<>>( - channel_size, local_width, + local_width, num_channels, channel_size, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), local_mean.LockedBuffer(), local_var.LockedBuffer(), this->m_epsilon, @@ -600,7 +653,7 @@ void batch_normalization_layer::bp_compute() { } // Compute error signal - El::Int num_per_sum; + int num_per_sum; if (this->m_statistics_group_size == 0) { // Global statistics aggregation. num_per_sum = channel_size * width; @@ -614,14 +667,15 @@ void batch_normalization_layer::bp_compute() { if (num_per_sum <= 1) { El::Zero(local_gradient_wrt_input); } else if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - backprop2_kernel + grid_dims.y = El::Min(local_width, 65535); + grid_dims.z = El::Min(num_channels, 65535); + bp_input_grad_kernel <<>>( - channel_size, local_width, num_per_sum, + local_width, num_channels, channel_size, num_per_sum, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), local_mean.LockedBuffer(), local_var.LockedBuffer(), this->m_epsilon, From e9eafdb031be7fb3e7f4ae5af9cd24dbfc1b5acb Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Tue, 1 Sep 2020 17:16:28 -0700 Subject: [PATCH 17/36] Feature add olcf center (#1598) * Added spack environments for OLCF and Summit. 
Updated build scripts to detect the OLCF systems via fully-qualified domain name. * Setting compiler to gcc 8.1.1 and CUDA to 11.0.2 * Adding the OLCF center to the python front end * Add option to control the number of parallel build jobs. Add limit for OLCF systems to avoid build processes being killed by the system. * Updated spack environment to reflect Spack yaml changes for v0.15.4+. * Remove the explicit dependency on cub since it is inherited from hydrogen. * Switched back to using GCC 7.4.0 and CUDA 10.x. * Put in a proper root path for OLCF and exceptions for data sets that don't exist. * Updated gcc versions for modules --- python/lbann/contrib/launcher.py | 18 +++ python/lbann/contrib/olcf/__init__.py | 0 python/lbann/contrib/olcf/launcher.py | 102 ++++++++++++++++ python/lbann/contrib/olcf/paths.py | 110 ++++++++++++++++++ python/lbann/contrib/olcf/systems.py | 69 +++++++++++ scripts/build_lbann_from_source.sh | 30 ++++- scripts/install_lbann.sh | 13 ++- spack_environments/olcf/compilers.sh | 41 +++++++ .../olcf/externals-linux-rhel7-power9le.sh | 96 +++++++++++++++ .../std_versions_and_variants.sh | 4 + 10 files changed, 475 insertions(+), 8 deletions(-) create mode 100644 python/lbann/contrib/olcf/__init__.py create mode 100644 python/lbann/contrib/olcf/launcher.py create mode 100644 python/lbann/contrib/olcf/paths.py create mode 100644 python/lbann/contrib/olcf/systems.py create mode 100644 spack_environments/olcf/compilers.sh create mode 100644 spack_environments/olcf/externals-linux-rhel7-power9le.sh diff --git a/python/lbann/contrib/launcher.py b/python/lbann/contrib/launcher.py index 775e79a7408..b66f80fd051 100644 --- a/python/lbann/contrib/launcher.py +++ b/python/lbann/contrib/launcher.py @@ -30,6 +30,20 @@ def is_nersc_center(): """ return bool(os.getenv('NERSC_HOST')) +def is_olcf_center(): + """Current system is operated by the Oak Ridge Leadership + Computing Facility at Oak Ridge National Laboratory. + + Checks whether the domain name ends with ".ornl.gov". + Checks whether the environment variable OLCF_MODULEPATH_ROOT is set. + + """ + domain = socket.getfqdn().split('.') + return (len(domain) > 2 + and domain[-2] == 'ornl' + and domain[-1] == 'gov') +# return bool(os.getenv('OLCF_MODULEPATH_ROOT')) + # Detect compute center and choose launcher _center = 'unknown' launcher = lbann.launcher @@ -41,6 +55,10 @@ def is_nersc_center(): _center = 'nersc' import lbann.contrib.nersc.launcher launcher = lbann.contrib.nersc.launcher +elif is_olcf_center(): + _center = 'olcf' + import lbann.contrib.olcf.launcher + launcher = lbann.contrib.olcf.launcher def compute_center(): """Name of organization that operates current system.""" diff --git a/python/lbann/contrib/olcf/__init__.py b/python/lbann/contrib/olcf/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/lbann/contrib/olcf/launcher.py b/python/lbann/contrib/olcf/launcher.py new file mode 100644 index 00000000000..75ea02aff53 --- /dev/null +++ b/python/lbann/contrib/olcf/launcher.py @@ -0,0 +1,102 @@ +import os +from lbann.contrib.olcf.systems import * +import lbann.launcher +from lbann.util import make_iterable + +def run(*args, **kwargs): + """Run LBANN with OLCF-specific optimizations (deprecated). + + This is deprecated. Use `lbann.contrib.launcher.run` instead. + + """ + + import warnings + warnings.warn( + 'Using deprecated function `lbann.contrib.olcf.launcher.run`. ' + 'Use `lbann.contrib.launcher.run` instead.' 
+ ) + from ..launcher import run as _run + _run(*args, **kwargs) + +def make_batch_script( + system=system(), + procs_per_node=procs_per_node(), + scheduler=scheduler(), + launcher_args=[], + environment={}, + *args, + **kwargs, +): + """Construct batch script manager with OLCF-specific optimizations. + + This is a wrapper around `lbann.launcher.make_batch_script`, with + defaults and optimizations for OLCF systems. See that function for a + full list of options. + + """ + + # Create shallow copies of input arguments + launcher_args = list(make_iterable(launcher_args)) + environment = environment.copy() + + # Helper function to configure environment variables + # Note: User-provided values take precedence, followed by values + # in the environment, followed by default values. + def set_environment(key, default): + if key not in environment: + environment[key] = os.getenv(key, default) + + # Setup GPU bindings + # Note: Each Hydrogen process is assigned to the GPU index that + # matches its node communicator rank. This is not compatible with + # mpibind, which assigns a GPU with index 0 to each process. We + # can't use an exclusive GPU compute mode since processes may + # touch the wrong GPU while figuring out ownership. + if scheduler == 'slurm' and has_gpu(system): + launcher_args.extend(['--mpibind=off', + '--nvidia_compute_mode=default']) + + # Optimizations for Summit-like systems + if system in ('summit',): + + # Set thread affinity + # Note: Aluminum's default thread affinity is incorrect since + # hwloc treats GPUs as NUMA domains. + # Note: There are actually 22 cores/socket, but it seems that + # powers of 2 are better for performance. + cores_per_socket = 16 + procs_per_socket = (procs_per_node + 1) // 2 + cores_per_proc = cores_per_socket // procs_per_socket + set_environment('AL_PROGRESS_RANKS_PER_NUMA_NODE', procs_per_socket) + set_environment('OMP_NUM_THREADS', cores_per_proc) + if scheduler == 'lsf': + launcher_args.append('--bind packed:{}'.format(cores_per_proc)) + + # Hack to enable process forking + # Note: InfiniBand is known to experience hangs if an MPI + # process is forked (see + # https://www.open-mpi.org/faq/?category=openfabrics#ofa-fork). + # Setting IBV_FORK_SAFE seems to fix this issue, but it may + # hurt performance (see + # https://linux.die.net/man/3/ibv_fork_init). + set_environment('IBV_FORK_SAFE', 1) + + # Hacked bugfix for hcoll (1/23/19) + # Note: Fixes hangs in MPI_Bcast.
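A minimal standalone sketch of the precedence rule that the set_environment helper above implements (user-provided values win over inherited environment variables, which win over defaults; the values below are illustrative, not from the patch):

    import os
    environment = {'OMP_NUM_THREADS': '4'}   # user-provided setting
    def set_environment(key, default):
        if key not in environment:
            environment[key] = os.getenv(key, default)
    set_environment('OMP_NUM_THREADS', 16)   # kept as '4': the user's value wins
    set_environment('IBV_FORK_SAFE', 1)      # os.environ value if set, else the default 1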
+ set_environment('HCOLL_ENABLE_SHARP', 0) + set_environment('OMPI_MCA_coll_hcoll_enable', 0) + + # Hacked bugfix for Spectrum MPI PAMI (9/17/19) + set_environment('PAMI_MAX_NUM_CACHED_PAGES', 0) + + # Configure NVSHMEM to load Spectrum MPI + set_environment('NVSHMEM_MPI_LIB_NAME', 'libmpi_ibm.so') + + return lbann.launcher.make_batch_script( + procs_per_node=procs_per_node, + scheduler=scheduler, + launcher_args=launcher_args, + environment=environment, + *args, + **kwargs, + ) diff --git a/python/lbann/contrib/olcf/paths.py b/python/lbann/contrib/olcf/paths.py new file mode 100644 index 00000000000..2974dc3aa4a --- /dev/null +++ b/python/lbann/contrib/olcf/paths.py @@ -0,0 +1,110 @@ +"""Useful file paths on OLCF systems.""" +import os.path +from lbann.contrib.olcf.systems import system + +# ============================================== +# Data sets +# ============================================== + +def parallel_file_system_path(system = system()): + """Base path to parallel file system.""" + if system in ('summit',): + return '/ccs/proj/ast153/' + else: + return '/ccs/proj/ast153/' + +def mnist_dir(system = system()): + """MNIST directory on OLCF system. + + The directory contains four files: train-images-idx3-ubyte, + train-labels-idx1-ubyte, t10k-images-idx3-ubyte, + t10k-labels-idx1-ubyte. These files can be obtained by downloading + from http://yann.lecun.com/exdb/mnist/ and uncompressing. + + """ + raise AssertionError("Unimplemented data set") + return parallel_file_system_path(system) + '/datasets/MNIST' + +def cifar10_dir(system = system()): + """CIFAR10 directory on OLCF systems.""" + raise AssertionError("Unimplemented data set") + return parallel_file_system_path(system) + '/datasets/cifar10-bin' + +def imagenet_dir(system = system(), data_set = 'training', + num_classes = 1000): + """ImageNet directory on OLCF system. + + The directory contains JPEG images from the ILSVRC2012 + competition. File names in the label file are relative to this + directory. The images can be obtained from + http://image-net.org/challenges/LSVRC/2012/. + + There are three available data sets: 'training', 'validation', and + 'testing'. + + Some of these data sets have been preprocessed to only include + images in a subset of the label classes, e.g. images in the first + 10 label classes. This is convenient for quickly evaluating + performance or learning behavior. The availability of these + subsampled data sets may vary by system. + + """ + raise AssertionError("Unimplemented data set") + base_path = parallel_file_system_path(system) + base_path += 'datasets/ILSVRC2012/original/' + if data_set.lower() in ('train', 'training'): + return base_path + 'train/' + elif data_set.lower() in ('val', 'validation'): + return base_path + 'val/' + elif data_set.lower() in ('test', 'testing'): + return base_path + 'test/' + else: + raise RuntimeError('unknown ImageNet data set (' + data_set + ')') + +def imagenet_labels(system = system(), data_set = 'train', + num_classes = 1000): + """ImageNet label file on OLCF system. + + The file contains ground truth labels from the ILSVRC2012 + competition. It is a plain text file where each line contains an + image file path (relative to the ImageNet directory; see the + `imagenet_dir` function) and the corresponding label ID. + + There are three available data sets: 'training', 'validation', and + 'testing'. + + Some of these data sets have been preprocessed to only include + images in a subset of the label classes, e.g. images in the first + 10 label classes.
This is convenient for quickly evaluating + performance or learning behavior. The availability of these + subsampled data sets may vary by system. + + """ + raise AssertionError("Unimplemented data set") + label_dir = parallel_file_system_path(system) + if system in ('lassen', 'sierra'): + label_dir += 'datasets/ILSVRC2012/original/labels/' + else: + label_dir += 'datasets/ILSVRC2012/labels/' + suffixes = {1000: '', 10: '_c0-9', 100: '_c0-99', + 200: '_c100-299', 300: '_c0-299'} + if data_set.lower() in ('train', 'training'): + if num_classes in suffixes.keys(): + return os.path.join(label_dir, + 'train' + suffixes[num_classes] + '.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('val', 'validation'): + if num_classes in suffixes.keys(): + return os.path.join(label_dir, + 'val' + suffixes[num_classes] + '.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('test', 'testing'): + return os.path.join(label_dir, 'test.txt') + else: + raise RuntimeError('unknown ImageNet data set (' + data_set + ')') diff --git a/python/lbann/contrib/olcf/systems.py b/python/lbann/contrib/olcf/systems.py new file mode 100644 index 00000000000..9a4fc408c63 --- /dev/null +++ b/python/lbann/contrib/olcf/systems.py @@ -0,0 +1,69 @@ +"""Default settings for OLCF systems.""" +import socket +import re + +# ============================================== +# Set system parameters +# ============================================== + +class SystemParams: + """Simple data structure to describe an OLCF system.""" + def __init__(self, cores_per_node, gpus_per_node, scheduler): + self.cores_per_node = cores_per_node + self.gpus_per_node = gpus_per_node + self.scheduler = scheduler + +# Supported OLCF systems +_system_params = { + 'summit': SystemParams(44, 6, 'lsf'), +} + +# Detect system +_system = re.sub(r'\d+', '', socket.gethostname()) +if _system not in _system_params.keys(): + _system = None + +# ============================================== +# Access functions +# ============================================== + +def system(): + """Name of OLCF system.""" + if _system: + return _system + else: + raise RuntimeError('unknown system ' + '(' + socket.gethostname() + ')') + +def is_olcf_system(system = system()): + """Whether current system is a supported OLCF system.""" + return (system is not None) and (system in _system_params.keys()) + +def gpus_per_node(system = system()): + """Number of GPUs per node.""" + if not is_olcf_system(system): + raise RuntimeError('unknown system (' + system + ')') + return _system_params[system].gpus_per_node + +def has_gpu(system = system()): + """Whether OLCF system has GPUs.""" + return gpus_per_node(system) > 0 + +def cores_per_node(system = system()): + """Number of CPU cores per node.""" + if not is_olcf_system(system): + raise RuntimeError('unknown system (' + system + ')') + return _system_params[system].cores_per_node + +def scheduler(system = system()): + """Job scheduler for OLCF system.""" + if not is_olcf_system(system): + raise RuntimeError('unknown system (' + system + ')') + return _system_params[system].scheduler + +def procs_per_node(system = system()): + """Default number of processes per node.""" + if has_gpu(system): + return gpus_per_node(system) + else: + raise RuntimeError('unknown system (' + system + ')') diff --git
a/scripts/build_lbann_from_source.sh b/scripts/build_lbann_from_source.sh index 14e9d0e6c66..ec97d078a70 100755 --- a/scripts/build_lbann_from_source.sh +++ b/scripts/build_lbann_from_source.sh @@ -34,6 +34,7 @@ fi LBANN_HOME=$(dirname ${SCRIPTS_DIR}) SPACK_ENV_DIR=${LBANN_HOME}/spack_environments +NINJA_NUM_PROCESSES=0 # Let ninja decide # Identify the center that we are running at CENTER= @@ -46,11 +47,19 @@ if [[ ${SYS} = "Darwin" ]]; then BUILD_SUFFIX=llnl.gov else CORI=$([[ $(hostname) =~ (cori|cgpu) ]] && echo 1 || echo 0) + DOMAINNAME=$(python -c 'import socket; domain = socket.getfqdn().split("."); print(domain[-2] + "." + domain[-1])') if [[ ${CORI} -eq 1 ]]; then CENTER="nersc" # Make sure to purge and setup the modules properly prior to finding the Spack architecture source ${SPACK_ENV_DIR}/${CENTER}/setup_modules.sh BUILD_SUFFIX=nersc.gov + elif [[ ${DOMAINNAME} = "ornl.gov" ]]; then + CENTER="olcf" + BUILD_SUFFIX=${DOMAINNAME} + NINJA_NUM_PROCESSES=16 # Don't let OLCF kill build jobs + elif [[ ${DOMAINNAME} = "llnl.gov" ]]; then + CENTER="llnl_lc" + BUILD_SUFFIX=${DOMAINNAME} else CENTER="llnl_lc" BUILD_SUFFIX=llnl.gov @@ -102,6 +111,7 @@ Options: ${C}--instrument${N} Use -finstrument-functions flag, for profiling stack traces ${C}-s | --superbuild${N} Superbuild LBANN with hydrogen and aluminum ${C}-c | --distconv${N} Enable the DistConv library + ${C}--ninja-processes${N} Number of parallel processes for ninja. EOF } @@ -208,6 +218,15 @@ while :; do # MPI-CUDA backend is required for Distconv ALUMINUM_WITH_MPI_CUDA=ON ;; + --ninja-processes) + if [ -n "${2}" ]; then + NINJA_NUM_PROCESSES=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + exit 1 + fi + ;; -?*) # Unknown option echo "Unknown option (${1})" >&2 @@ -266,9 +285,16 @@ fi source ${SPACK_ENV_DIR}/${SUPERBUILD} -ninja install +if [ ${NINJA_NUM_PROCESSES} -ne 0 ]; then + BUILD_COMMAND="ninja -j${NINJA_NUM_PROCESSES}" +else + # Usually equivalent to -j + BUILD_COMMAND="ninja" +fi + +${BUILD_COMMAND} install echo "To rebuild the environment:" echo " ${SPACK_ENV_CMD}" echo " cd ${LBANN_BUILD_DIR}" -echo " ninja install" +echo " ${BUILD_COMMAND} install" diff --git a/scripts/install_lbann.sh b/scripts/install_lbann.sh index 9659661903b..670a71332c5 100755 --- a/scripts/install_lbann.sh +++ b/scripts/install_lbann.sh @@ -41,10 +41,15 @@ if [[ ${SYS} = "Darwin" ]]; then CENTER="osx" else CORI=$([[ $(hostname) =~ (cori|cgpu) ]] && echo 1 || echo 0) + DOMAINNAME=$(python -c 'import socket; domain = socket.getfqdn().split("."); print(domain[-2] + "." 
+ domain[-1])') if [[ ${CORI} -eq 1 ]]; then CENTER="nersc" # Make sure to purge and setup the modules properly prior to finding the Spack architecture source ${SPACK_ENV_DIR}/${CENTER}/setup_modules.sh + elif [[ ${DOMAINNAME} = "ornl.gov" ]]; then + CENTER="olcf" + elif [[ ${DOMAINNAME} = "llnl.gov" ]]; then + CENTER="llnl_lc" else CENTER="llnl_lc" fi @@ -158,12 +163,14 @@ DIHYDROGEN_VARIANTS="variants: +shared +al +openmp ${HALF_VARIANTS}" if [[ ${DEPS_ONLY} = "TRUE" ]]; then if [[ ${SYS} != "Darwin" ]]; then HYDROGEN_VARIANTS="${HYDROGEN_VARIANTS} +openmp_blas" + DIHYDROGEN_VARIANTS="${DIHYDROGEN_VARIANTS} +openmp_blas" COMPILER_PACKAGE=$(cat < Date: Wed, 2 Sep 2020 09:08:02 -0700 Subject: [PATCH 18/36] Fix the cudnn version (#1612) --- spack_environments/llnl_lc/externals-linux-rhel7-haswell.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spack_environments/llnl_lc/externals-linux-rhel7-haswell.sh b/spack_environments/llnl_lc/externals-linux-rhel7-haswell.sh index 833877e0574..03dea96b4db 100644 --- a/spack_environments/llnl_lc/externals-linux-rhel7-haswell.sh +++ b/spack_environments/llnl_lc/externals-linux-rhel7-haswell.sh @@ -35,7 +35,7 @@ EXTERNAL_PACKAGES=$(cat < Date: Wed, 2 Sep 2020 11:44:41 -0700 Subject: [PATCH 19/36] ATOM VAE (#1613) * draft implementation of ATOM VAE * VAE draft * VAE draft * Add smaller (10K) dataset, and model cleanup * Add smaller (10K) dataset, and model cleanup * Add filename and filedir to arg * More args added to streamline large scale experiments --- applications/ATOM/train_atom_vae.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/applications/ATOM/train_atom_vae.py b/applications/ATOM/train_atom_vae.py index 61119aebe1e..afbfdbf760b 100644 --- a/applications/ATOM/train_atom_vae.py +++ b/applications/ATOM/train_atom_vae.py @@ -47,14 +47,19 @@ def construct_lc_launcher_args(): parser.add_argument("--batch-size", type=int, default=512) parser.add_argument("--num-epochs", type=int, default=20) parser.add_argument("--data-reader-prototext", default=None) + parser.add_argument("--data-filedir", default=None) + parser.add_argument("--data-filename", default=None) parser.add_argument("--pad-index", type=int, default=None) parser.add_argument("--sequence-length", type=int, default=None) - parser.add_argument("--dump_weights_dir", type=str, default="weights") + parser.add_argument("--dump-weights-dir", type=str, default="weights") + parser.add_argument("--dump-weights-interval", type=int, default=10) parser.add_argument("--num-samples", type=int, default=None) parser.add_argument("--num-io-threads", type=int, default=11) parser.add_argument("--vocab", default=None) parser.add_argument("--delimiter", default="c") parser.add_argument("--no-header", type=bool, default=True) + parser.add_argument("--ltfb", type=bool, default=False) + parser.add_argument("--ltfb-batch-interval", type=int, default=100) # these are specific to the Trainer object parser.add_argument( @@ -92,7 +97,7 @@ def construct_model(run_args): print("sequence length is {}".format(sequence_length)) data_layout = "data_parallel" # Layer graph - input_ = lbann.Identity(lbann.Input(name='inp'), name='inp1') + input_ = lbann.Identity(lbann.Input(name='inp',target_mode="N/A"), name='inp1') vae_loss= [] input_feature_dims = sequence_length @@ -127,9 +132,11 @@ def construct_model(run_args): callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer(), - lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir, epoch_interval=10)] 
- + lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir, epoch_interval=run_args.dump_weights_interval)] + if(run_args.ltfb): + callbacks.append(lbann.CallbackLTFB(batch_interval=run_args.ltfb_batch_interval,metric='recon', + low_score_wins=True,exchange_hyperparameters=True)) # Construct model return lbann.Model(run_args.num_epochs, weights=weights, @@ -227,21 +234,28 @@ def main(): import torch torch.save(run_args, "{}/{}_config.pt".format(experiment_dir, run_args.job_name)) + m_lbann_args=f"--vocab={run_args.vocab} --data_filedir={run_args.data_filedir} --data_filename_train={run_args.data_filename} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length} --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}" + if(run_args.data_reader_prototext): + m_lbann_args = " ".join((m_lbann_args, " --use_data_store --preload_data_store ")) + if(run_args.ltfb): + m_lbann_args = " ".join((m_lbann_args, "--ltfb")) + status = lbann.contrib.launcher.run( trainer, model, data_reader, opt, - #partition=run_args.partition, + partition=run_args.partition, scheduler=run_args.scheduler, #account=run_args.account, time_limit=run_args.time_limit, nodes=run_args.nodes, procs_per_node=ppn, - batch_job = True, + #batch_job = True, + setup_only = True, job_name=run_args.job_name, experiment_dir=experiment_dir, - lbann_args=f"--vocab={run_args.vocab} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length} --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}", + lbann_args = m_lbann_args, ) print("LBANN launcher status:\n" + str(status)) From b64852feba6d418fdcaf4ea3ad3829e65ab89852 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Wed, 2 Sep 2020 13:34:37 -0700 Subject: [PATCH 20/36] JAG model experiment (#1614) * Add parser arguments to model scripts, add model evaluation script * More cleanup * minor refactor/clean up of scripts * Add L2 regularization to surrogate * Add conduit based data reader and metadata * Add serialize io to model argument * Add parser arguments to training and evaluation scripts and all the neat stuff that streamlines training at scale * Add a simple script that checks for zero-valued scalars in JAG samples * Set serialize_io=True as default, addresses @vanessen's comment. Also set single view as default in metadata.
* Add model capacity factor to networks, make 3 views default in metadata * Add model capacity factor to networks, make 3 views default in metadata * Fix warning * Separate mcf to wae and surrogate * Add mcf to eval model * Add LTFB * Add ltfb batch interval --- .../physics/ICF/eval_macc_surrogate.py | 14 +++++++--- applications/physics/ICF/macc_models.py | 26 ++++++++++++------- applications/physics/ICF/pre_train_jag_wae.py | 15 +++++++++-- .../physics/ICF/train_macc_surrogate.py | 22 ++++++++++++---- .../physics/data/jag_100M_metadata.prototext | 4 +-- 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/applications/physics/ICF/eval_macc_surrogate.py b/applications/physics/ICF/eval_macc_surrogate.py index 53f6665c687..cf67dcbf76b 100644 --- a/applications/physics/ICF/eval_macc_surrogate.py +++ b/applications/physics/ICF/eval_macc_surrogate.py @@ -16,7 +16,7 @@ cur_dir = dirname(abspath(__file__)) data_reader_prototext = join(dirname(cur_dir), 'data', - 'jag_conduit_reader.prototext') + 'eval_jag_conduit_lassen.prototext') metadata_prototext = join(dirname(cur_dir), 'data', 'jag_100M_metadata.prototext') @@ -49,6 +49,12 @@ parser.add_argument( '--xdim', action='store', default=5, type=int, help='input (x) dim (default: 5)', metavar='NUM') +parser.add_argument( + '--wae_mcf', action='store', default=1, type=int, + help='wae model capacity factor (default: 1)', metavar='NUM') +parser.add_argument( + '--surrogate_mcf', action='store', default=1, type=int, + help='surrogate model capacity factor (default: 1)', metavar='NUM') parser.add_argument( '--lamda-cyc', action='store', default=1e-3, type=float, help='lamda-cyc (default: 1e-3)', metavar='NUM') @@ -107,9 +113,9 @@ def construct_model(): z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") - wae = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) #pretrained, freeze - inv = macc_models.MACCInverse(args.xdim) - fwd = macc_models.MACCForward(args.zdim) + wae = macc_models.MACCWAE(args.zdim,args.ydim,cf=args.wae_mcf,use_CNN=args.useCNN) #pretrained, freeze + inv = macc_models.MACCInverse(args.xdim,cf=args.surrogate_mcf) + fwd = macc_models.MACCForward(args.zdim,cf=args.surrogate_mcf) y_pred_fwd = wae.encoder(gt_y) diff --git a/applications/physics/ICF/macc_models.py b/applications/physics/ICF/macc_models.py index bd6e9c9202d..440ba2fa746 100644 --- a/applications/physics/ICF/macc_models.py +++ b/applications/physics/ICF/macc_models.py @@ -7,15 +7,17 @@ class MACCForward(lbann.modules.Module): global_count = 0 # Static counter, used for default names - def __init__(self, out_dim,name=None): + #model capacity factor cf + def __init__(self, out_dim,cf=1,name=None): self.instance = 0 self.name = (name if name else 'macc_forward{0}'.format(MACCForward.global_count)) fc = lbann.modules.FullyConnectedModule + assert isinstance(cf, int), 'model capacity factor should be an int!' 
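The capacity factor cf introduced here uniformly widens each fully-connected stack. A tiny standalone illustration mirroring the list comprehensions used in these classes (the base widths are the defaults from this file):

    cf = 2                               # model capacity factor
    base = [32, 256, 1024]               # default MACCForward generator widths
    g_neurons = [x * cf for x in base]   # doubles each layer: [64, 512, 2048]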
#generator #fc2_gen0 - g_neurons = [32,256,1024] + g_neurons = [x*cf for x in [32,256,1024]] self.gen_fc = [fc(g_neurons[i],activation=lbann.Relu, name=self.name+'gen_fc'+str(i)) for i in range(len(g_neurons))] self.predy = fc(out_dim,name=self.name+'pred_out') @@ -27,16 +29,17 @@ def forward(self,x): class MACCInverse(lbann.modules.Module): global_count = 0 # Static counter, used for default names - - def __init__(self, out_dim,name=None): + #model capacity factor cf + def __init__(self, out_dim,cf=1,name=None): self.instance = 0 self.name = (name if name else 'macc_inverse{0}'.format(MACCInverse.global_count)) fc = lbann.modules.FullyConnectedModule + assert isinstance(cf, int), 'model capacity factor should be an int!' #generator #fc_gen1 - g_neurons = [16,128,64] + g_neurons = [x*cf for x in [16,128,64]] self.gen_fc = [fc(g_neurons[i],activation=lbann.Relu, name=self.name+'gen_fc'+str(i)) for i in range(len(g_neurons))] self.predx = fc(out_dim,name=self.name+'pred_out') @@ -48,8 +51,8 @@ def forward(self,y): class MACCWAE(lbann.modules.Module): global_count = 0 # Static counter, used for default names - - def __init__(self, encoder_out_dim, decoder_out_dim, scalar_dim = 15, use_CNN=False, name=None): + #model capacity factor (cf) + def __init__(self, encoder_out_dim, decoder_out_dim, scalar_dim = 15, cf=1, use_CNN=False, name=None): self.instance = 0 self.name = (name if name else 'macc_wae{0}'.format(MACCWAE.global_count)) @@ -59,10 +62,13 @@ def __init__(self, encoder_out_dim, decoder_out_dim, scalar_dim = 15, use_CNN=Fa fc = lbann.modules.FullyConnectedModule conv = lbann.modules.Convolution2dModule - disc_neurons = [128,64,1] - encoder_neurons = [32,256,128] - decoder_neurons = [64,128,256] + assert isinstance(cf, int), 'model capacity factor should be an int!' 
+ disc_neurons = [128,64,1] + encoder_neurons = [x*cf for x in [32,256,128]] + decoder_neurons = [x*cf for x in [64,128,256]] + #Enc/Dec sizes [32, 256, 128] [64, 128, 256] + print("CF, Enc/Dec sizes ", cf, " ", encoder_neurons, " ", decoder_neurons) enc_outc = [64,32,16] dec_outc = [32,16,4] diff --git a/applications/physics/ICF/pre_train_jag_wae.py b/applications/physics/ICF/pre_train_jag_wae.py index 997e7a26a39..cf27df5c251 100644 --- a/applications/physics/ICF/pre_train_jag_wae.py +++ b/applications/physics/ICF/pre_train_jag_wae.py @@ -43,6 +43,9 @@ parser.add_argument( '--zdim', action='store', default=20, type=int, help='latent space dim (default: 20)', metavar='NUM') +parser.add_argument( + '--mcf', action='store', default=1, type=int, + help='model capacity factor (default: 1)', metavar='NUM') parser.add_argument( '--useCNN', action='store', default=False, type=bool, help='use CNN', metavar='BOOL') @@ -67,6 +70,9 @@ parser.add_argument( '--procs-per-trainer', action='store', default=0, type=int, help='processes per trainer (default: 0)', metavar='NUM') +parser.add_argument( + '--ltfb-batch-interval', action='store', default=0, type=int, + help='LTFB batch interval (default: 0, no LTFB)', metavar='NUM') args = parser.parse_args() @@ -95,7 +101,7 @@ def construct_model(): z_dim = 20 #Latent space dim z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") - model = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) + model = macc_models.MACCWAE(args.zdim,args.ydim,cf=args.mcf,use_CNN=args.useCNN) d1_real, d1_fake, d_adv, pred_y = model(z,gt_y) d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real,one],name='d1_real_bce') @@ -131,6 +137,11 @@ def construct_model(): destination_layers=list2str(dst_layers), batch_interval=2)] + if(args.ltfb_batch_interval > 0) : + callbacks.append(lbann.CallbackLTFB(batch_interval=args.ltfb_batch_interval,metric='recon_error', + low_score_wins=True, + exchange_hyperparameters=True)) + # Construct model return lbann.Model(args.num_epochs, serialize_io=True, @@ -160,7 +171,7 @@ def construct_model(): nodes=args.num_nodes, procs_per_node=args.ppn, time_limit=720, - setup_only=False, + setup_only=True, job_name=args.job_name, lbann_args=['--use_data_store --preload_data_store', f'--metadata={metadata_prototext}', diff --git a/applications/physics/ICF/train_macc_surrogate.py b/applications/physics/ICF/train_macc_surrogate.py index 396465d5838..82c072c5486 100644 --- a/applications/physics/ICF/train_macc_surrogate.py +++ b/applications/physics/ICF/train_macc_surrogate.py @@ -52,6 +52,12 @@ parser.add_argument( '--xdim', action='store', default=5, type=int, help='input (x) dim (default: 5)', metavar='NUM') +parser.add_argument( + '--wae_mcf', action='store', default=1, type=int, + help='model capacity factor (default: 1)', metavar='NUM') +parser.add_argument( + '--surrogate_mcf', action='store', default=1, type=int, + help='model capacity factor (default: 1)', metavar='NUM') parser.add_argument( '--lamda-cyc', action='store', default=1e-3, type=float, help='lamda-cyc (default: 1e-3)', metavar='NUM') @@ -82,11 +88,13 @@ parser.add_argument( '--procs-per-trainer', action='store', default=0, type=int, help='processes per trainer (default: 0)', metavar='NUM') +parser.add_argument( + '--ltfb-batch-interval', action='store', default=0, type=int, + help='LTFB batch interval (default: 0, no LTFB)', metavar='NUM') args = parser.parse_args() if not(args.pretrained_dir): - print("WARNING pretrained dir ", args.pretrained_dir, " is empty, default option 
assumes - pretrained autoencoder") + print("WARNING pretrained dir ", args.pretrained_dir, " is empty, default option assumes pretrained autoencoder") def list2str(l): return ' '.join(l) @@ -111,9 +119,9 @@ def construct_model(): z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") - wae = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) #pretrained, freeze - inv = macc_models.MACCInverse(args.xdim) - fwd = macc_models.MACCForward(args.zdim) + wae = macc_models.MACCWAE(args.zdim,args.ydim,cf=args.wae_mcf,use_CNN=args.useCNN) #pretrained, freeze + inv = macc_models.MACCInverse(args.xdim,cf=args.surrogate_mcf) + fwd = macc_models.MACCForward(args.zdim,cf=args.surrogate_mcf) y_pred_fwd = wae.encoder(gt_y) @@ -175,6 +183,10 @@ def construct_model(): lbann.CallbackLoadModel(dirs=str(args.pretrained_dir)), lbann.CallbackTimer()] + if(args.ltfb_batch_interval > 0) : + callbacks.append(lbann.CallbackLTFB(batch_interval=args.ltfb_batch_interval,metric='fw_loss', + low_score_wins=True, + exchange_hyperparameters=True)) # Construct model return lbann.Model(args.num_epochs, weights=weights, diff --git a/applications/physics/data/jag_100M_metadata.prototext b/applications/physics/data/jag_100M_metadata.prototext index 7e22e71f0a9..de258021039 100644 --- a/applications/physics/data/jag_100M_metadata.prototext +++ b/applications/physics/data/jag_100M_metadata.prototext @@ -24,8 +24,8 @@ data_set_metadata { image_height: 64 image_num_channels: 4 - #jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] #3 views - jag_image_keys: ["(0.0, 0.0)/0.0/emi"] #1 view, default + jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] #3 views + #jag_image_keys: ["(0.0, 0.0)/0.0/emi"] #1 view, default scalar_prefix: "/outputs/scalars/" From f1f80eb513bd2d93eb7cc607cb22dc7724868a48 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Fri, 4 Sep 2020 09:28:45 -0700 Subject: [PATCH 21/36] Fixing the bounds checking for the smiles data reader and it's simple (#1615) LTFB partitioning. 
--- src/data_readers/data_reader_smiles.cpp | 74 ++++++++++++++----------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/src/data_readers/data_reader_smiles.cpp b/src/data_readers/data_reader_smiles.cpp index 45ee74e1fcb..492c8ca5e54 100644 --- a/src/data_readers/data_reader_smiles.cpp +++ b/src/data_readers/data_reader_smiles.cpp @@ -134,11 +134,21 @@ void smiles_data_reader::load() { // Use two loops here, to assure all trainers have // the same number of samples + // ensure then number of samples is evenly divisible by + // the number of trainers + size_t n = m_shuffled_indices.size() / num_trainers; + size_t s3 = n*num_trainers; + if (m_shuffled_indices.size() != s3) { + if (is_master()) { + std::cout << "adjusting global sample size from " << m_shuffled_indices.size() << " to " << s3 << std::endl; + } + m_shuffled_indices.resize(s3); + } for (size_t j=0; j sanity_max) sanity_max = id; - if (m_data_store->get_index_owner(id) != m_comm->get_rank_in_world()) { + if (m_data_store->get_index_owner(id) != m_comm->get_rank_in_trainer()) { continue; } valid_ids.insert(id); } int max_index = sanity_max; - // cheap sanity check + // cheap sanity check if ( (sanity_min != m_min_index || sanity_max != m_max_index) && get_role() == "train") { @@ -267,7 +277,7 @@ bool smiles_data_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { // no data_store: all data is stored locally if (m_data_store == nullptr) { get_sample(data_id, data); - data_ptr = data.data(); + data_ptr = data.data(); sz = data.size(); } @@ -287,10 +297,10 @@ bool smiles_data_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { data_ptr = data.data(); sz = data.size(); } - + size_t j; for (j = 0; j < sz; ++j) { - X(j, mb_idx) = data_ptr[j]; + X(j, mb_idx) = data_ptr[j]; } for (; j(m_linearized_data_size); j++) { X(j, mb_idx) = m_pad; @@ -323,11 +333,11 @@ void smiles_data_reader::print_statistics() const { std::cout << "max sequence length: " << utils::commify(m_linearized_data_size) << std::endl; std::cout << "num features=" << utils::commify(m_linearized_data_size) << std::endl; if (m_delimiter == '\t') { - std::cout << "delimiter: \n"; + std::cout << "delimiter: \n"; } else if (m_delimiter == ',') { - std::cout << "delimiter: \n"; + std::cout << "delimiter: \n"; } else if (m_delimiter == '\0') { - std::cout << "delimiter: \n"; + std::cout << "delimiter: \n"; } else { LBANN_ERROR("invalid delimiter character, as int: ", (int)m_delimiter); } @@ -356,7 +366,7 @@ void smiles_data_reader::load_vocab() { if (token.size() == 1) { m_vocab[token[0]] = id; m_vocab_inv[id] = token[0]; - } + } if (token == "") { m_pad = id; --sanity; @@ -401,7 +411,7 @@ int smiles_data_reader::get_num_lines(std::string fn) { } in.close(); - std::cout << "smiles_data_reader::get_num_lines; num_lines: " + std::cout << "smiles_data_reader::get_num_lines; num_lines: " << utils::commify(count) << " time: " << get_time()-tm1 << std::endl; } @@ -419,8 +429,8 @@ int smiles_data_reader::get_num_lines(std::string fn) { int n_lines = INT_MAX; if (opts->has_int("n_lines")) { n_lines = opts->get_int("n_lines"); - if(is_master() && count < n_lines) { - std::cout << "WARNING:: number of available samples (" << count + if(is_master() && count < n_lines) { + std::cout << "WARNING:: number of available samples (" << count << " ) in file " << fn << " is less than number of samples requested (" << n_lines << " ) I am returning number of available samples " << std::endl; } @@ -511,7 +521,7 @@ void smiles_data_reader::setup_local_cache() { double tm3 = 
get_time(); if (is_master()) { std::cout << "\nSTARTING smiles_data_reader::setup_fast_experimental() " << std::endl << std::endl; - } + } // This will hold: (dataum_id, datum_offset, datum length) for each sample std::vector sample_offsets(m_shuffled_indices.size()*3); @@ -531,15 +541,15 @@ void smiles_data_reader::setup_local_cache() { std::string line; if (m_has_header) { getline(in, line); - } + } // Part 1: compute memory requirements for local cache - // Get max sample id, which will be the number of lines we need to + // Get max sample id, which will be the number of lines we need to // read from file. This is needed if (1) not using 100% of data, // and/or (2) carving off part of train data to use as validation. std::unordered_set samples_to_use; - int max_sample_id = 0; + int max_sample_id = 0; for (size_t j=0; j max_sample_id ? m_shuffled_indices[j] : max_sample_id; @@ -568,8 +578,8 @@ void smiles_data_reader::setup_local_cache() { // Part 2: Fill in the data buffer in.seekg(0); if (m_has_header) { - getline(in, line); - } + getline(in, line); + } offset = 0; for (int j=0; jbroadcast(0, sample_offsets.data(), sample_offsets.size(), m_comm->get_world_comm()); for (size_t j=0; jget_rank_in_world() != 1) { return; } - + // option: testing the test ;) bool fail = options::get()->get_bool("make_test_fail"); @@ -676,7 +686,7 @@ void smiles_data_reader::test_encode() { ++num_tested; // encode then decode the datum that is stored in memory get_sample(sample_id, encoded); - decode_smiles(encoded, decoded); + decode_smiles(encoded, decoded); // get datum length from the line we've just read from file size_t k = get_smiles_string_length(line, sample_id); @@ -688,7 +698,7 @@ void smiles_data_reader::test_encode() { if (num_tested > 10 && fail) { for (size_t h=0; h>> TESTS PASSED <<< " << std::endl; } @@ -722,7 +732,7 @@ void smiles_data_reader::decode_smiles(const std::vector &data, std::stri for (auto tt : data) { s2 << tt << " "; } - s2 << "; m_vocab_inv.size(): " << m_vocab_inv.size() + s2 << "; m_vocab_inv.size(): " << m_vocab_inv.size() << " m_vocab_inv keys: "; for (auto tt : m_vocab_inv) { s2 << tt.first << " "; @@ -735,7 +745,7 @@ void smiles_data_reader::decode_smiles(const std::vector &data, std::stri s << ""; } else if (!(x == "" || x == "" || x == "")) { s << m_vocab_inv[t]; - } + } } out = s.str(); } @@ -766,7 +776,7 @@ void smiles_data_reader::get_delimiter() { break; default : LBANN_ERROR("Invalid delimiter character; should be 'c', 't', '0'; you passed: ", d); - } + } } if (is_master()) { std::cout << "USING delimiter character: (int)" << (int)m_delimiter << std::endl; From 0e321691f4320faf60e082dafe066a9df401e560 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Sat, 5 Sep 2020 15:47:44 -0700 Subject: [PATCH 22/36] Added a barrier after the conduit node exchange in the data store. (#1617) This eliminates the race condition that is occuring in the VAE ATOM model with the smiles data reader. In principle this should not be required, but there is some odd race between the non-blocking messages on the CPU and the GPU code. 
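The fix boils down to adding a trainer-wide barrier after the waits on the non-blocking sends and receives. A rough mpi4py analogue of the pattern (illustrative only; LBANN's lbann_comm wraps Aluminum/MPI, and none of the names below come from the patch):

    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank, nprocs = comm.Get_rank(), comm.Get_size()
    sendbuf = bytearray(b'x' * 8)
    recvbuf = bytearray(8)
    reqs = [comm.Isend(sendbuf, dest=(rank + 1) % nprocs, tag=0),
            comm.Irecv(recvbuf, source=(rank - 1) % nprocs, tag=0)]
    MPI.Request.Waitall(reqs)  # local completion of this rank's exchange
    comm.Barrier()             # keep any rank from racing into the next exchange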
--- src/data_store/data_store_conduit.cpp | 75 +++++++++++++-------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index f19413f1117..ef38975d8b9 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -79,10 +79,10 @@ data_store_conduit::data_store_conduit( if (opts->has_string("data_store_test_checkpoint") && opts->has_string("data_store_spill")) { LBANN_ERROR("you passed both --data_store_test_checkpoint and --data_store_spill; please use one or the other or none, but not both"); - } + } if (opts->has_string("data_store_test_checkpoint")) { setup_checkpoint_test(); - } + } if (opts->has_string("data_store_spill")) { setup_spill(opts->get_string("data_store_spill")); } @@ -90,7 +90,7 @@ data_store_conduit::data_store_conduit( set_is_local_cache(opts->get_bool("data_store_cache")); set_is_preloading(opts->get_bool("preload_data_store")); set_is_explicitly_loading(! is_preloading()); - + if (is_local_cache()) { PROFILE("data_store_conduit is running in local_cache mode"); } else { @@ -128,7 +128,7 @@ void data_store_conduit::setup_checkpoint_test() { std::string c = options::get()->get_string("data_store_test_checkpoint"); if (c == "1") { LBANN_ERROR("--data_store_test_checkpoint=1; you probably forgot to specify the spill directory; you must specify --data_store_test_checkpoint='"); - } + } if (c == "lassen") { c = get_lassen_spill_dir(); } @@ -161,8 +161,8 @@ data_store_conduit& data_store_conduit::operator=(const data_store_conduit& rhs) return (*this); } -void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { - m_reader = reader; +void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { + m_reader = reader; m_debug = 0; m_profile = 0; open_informational_files(); @@ -244,7 +244,7 @@ void data_store_conduit::setup_data_store_buffers() { void data_store_conduit::spill_preloaded_conduit_node(int data_id, const conduit::Node &node) { // note: at this point m_data[data_id] = node conduit::Node n3 = node; - { + { std::lock_guard lock(m_mutex); build_node_for_sending(node, n3); } @@ -277,7 +277,7 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, const conduit:: if (is_local_cache()) { std::lock_guard lock(m_mutex); ++m_my_num_indices; - m_data[data_id] = node; + m_data[data_id] = node; return; } @@ -287,7 +287,7 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, const conduit:: return; } - { + { conduit::Node n2 = node; // node == m_data[data_id] std::lock_guard lock(m_mutex); build_node_for_sending(n2, m_data[data_id]); @@ -393,9 +393,9 @@ void data_store_conduit::set_conduit_node(int data_id, const conduit::Node &node m_owner[data_id] = m_rank_in_trainer; build_node_for_sending(node, m_data[data_id]); m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); - } + } error_check_compacted_node(m_data[data_id], data_id); - } + } } } @@ -578,6 +578,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s tm5 = get_time(); m_comm->wait_all(m_send_requests); m_comm->wait_all(m_recv_requests); + m_comm->trainer_barrier(); m_wait_all_time += (get_time() - tm5); //======================================================================== @@ -941,7 +942,7 @@ void data_store_conduit::set_loading_is_complete() { } } -bool data_store_conduit::is_fully_loaded() const { +bool data_store_conduit::is_fully_loaded() const { if (m_loading_is_complete) { return 
true; } @@ -972,7 +973,7 @@ void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vectorsize(); h += m_np_in_trainer) { @@ -1131,7 +1132,7 @@ void data_store_conduit::exchange_local_caches() { PROFILE(" is_local_cache(): ", is_local_cache()); PROFILE(" is_fully_loaded: ", is_fully_loaded()); - // indices[j] will contain the indices + // indices[j] will contain the indices // that P_j will read from disk, and subsequently bcast to all others std::vector> indices; @@ -1201,7 +1202,7 @@ void data_store_conduit::build_conduit_nodes(map_is_t &sizes) { const std::vector &image_list = image_reader->get_image_list(); for (auto t : sizes) { int data_id = t.first; - int label = image_list[data_id].second; + int label = image_list[data_id].second; if (m_image_offsets.find(data_id) == m_image_offsets.end()) { LBANN_ERROR("m_image_offsets.find(data_id) == m_image_offsets.end() for data_id: ", data_id); } @@ -1435,7 +1436,7 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ PROFILE(" is_fully_loaded: ", is_fully_loaded()); if (! is_local_cache()) { profile_timing(); - } + } } double tm1 = get_time(); @@ -1445,11 +1446,11 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ PROFILE("calling exchange_owner_maps"); if (!m_owner_maps_were_exchanged) { exchange_owner_maps(); - } + } - else { + else { PROFILE(" owner_maps were already exchanged; returning"); - } + } m_owner_maps_were_exchanged = true; PROFILE("exchange_mini_batch_data; m_owner_maps_were_exchanged = true"); /* @@ -1458,7 +1459,7 @@ PROFILE("exchange_mini_batch_data; m_owner_maps_were_exchanged = true"); m_is_spilled = true; m_metadata.close(); save_state(); - } + } */ } @@ -1517,7 +1518,7 @@ void data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) { } if (m_world_master) { - std::cout << "Cleared the owner map; m_owner.size(): " << m_owner.size() + std::cout << "Cleared the owner map; m_owner.size(): " << m_owner.size() << std::endl << "Calling load_checkpoint" << std::endl; } @@ -1571,7 +1572,7 @@ void data_store_conduit::setup_spill(std::string base_dir) { // open metadata file; this will contain the file pathnames of spilled // conduit nodes const std::string fnn = get_metadata_fn(); - m_metadata.open(fnn.c_str()); + m_metadata.open(fnn.c_str()); if (!m_metadata) { LBANN_ERROR("failed to open ", fnn, " for writing"); } @@ -1614,15 +1615,15 @@ void data_store_conduit::save_state() { { cereal::XMLOutputArchive archive(os); archive(CEREAL_NVP(m_my_num_indices), - CEREAL_NVP(m_owner_maps_were_exchanged), + CEREAL_NVP(m_owner_maps_were_exchanged), CEREAL_NVP(m_is_setup), - CEREAL_NVP(m_preloading), - CEREAL_NVP(m_loading_is_complete), + CEREAL_NVP(m_preloading), + CEREAL_NVP(m_loading_is_complete), CEREAL_NVP(m_explicitly_loading), - CEREAL_NVP(m_owner_map_mb_size), - CEREAL_NVP(m_compacted_sample_size), + CEREAL_NVP(m_owner_map_mb_size), + CEREAL_NVP(m_compacted_sample_size), CEREAL_NVP(m_is_local_cache), - CEREAL_NVP(m_node_sizes_vary), + CEREAL_NVP(m_node_sizes_vary), CEREAL_NVP(m_have_sample_sizes), CEREAL_NVP(m_owner), CEREAL_NVP(m_sample_sizes)); @@ -1670,7 +1671,7 @@ void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_read m_rank_in_trainer = m_comm->get_rank_in_trainer(); m_rank_in_world = m_comm->get_rank_in_world(); m_np_in_trainer = m_comm->get_procs_per_trainer(); - } + } // Open the metadata file; this is an index of the checkpointed conduit filenames const std::string metadata_fn = get_metadata_fn(); @@ -1720,7
+1721,7 @@ std::string data_store_conduit::get_conduit_dir() const { } std::string data_store_conduit::get_cereal_fn() const { - return m_spill_dir_base + '/' + m_cereal_fn + "_" + m_reader->get_role() + "_" + std::to_string(m_rank_in_world) + ".xml"; + return m_spill_dir_base + '/' + m_cereal_fn + "_" + m_reader->get_role() + "_" + std::to_string(m_rank_in_world) + ".xml"; } std::string data_store_conduit::get_metadata_fn() const { @@ -1851,7 +1852,7 @@ void data_store_conduit::test_imagenet_node(int index, bool dereference) { std::cout << "; (>= INT_MAX)\n"; } else { std::cout << std::endl; - } + } conduit::Node nd1; image_reader->load_conduit_node_from_file(data_id, nd1); char *buf1 = nd1[LBANN_DATA_ID_STR(data_id) + "/buffer"].value(); @@ -1870,7 +1871,7 @@ void data_store_conduit::test_imagenet_node(int index, bool dereference) { const conduit::Schema &s = nd2.schema(); s.print(); nd2.print(); - } + } @@ -1940,13 +1941,13 @@ void data_store_conduit::check_query_flags() const { } } -void data_store_conduit::clear_owner_map() { +void data_store_conduit::clear_owner_map() { m_owner_maps_were_exchanged = false; - m_owner.clear(); + m_owner.clear(); } void data_store_conduit::verify_sample_size() { - // Note: m_compacted_sample_size is set during calls to set_conduit_node() or + // Note: m_compacted_sample_size is set during calls to set_conduit_node() or // set_preloaded_conduit_node(). Hence, if these are not called (i.e., the // rank does not own any data), m_compacted_sample_size will be zero. // This method ensures that all ranks know the sample size, whether or not @@ -1979,11 +1980,9 @@ size_t data_store_conduit::get_mem_usage() { LBANN_ERROR("node does not have a valid contiguous data pointer"); } r += nd.total_bytes_compact(); - } + } return r; } } // namespace lbann - - From 8cbedb5ad22c6e6a0ed1a2ced257d3f6f407ec68 Mon Sep 17 00:00:00 2001 From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com> Date: Mon, 7 Sep 2020 17:03:49 +0900 Subject: [PATCH 23/36] Merging data_reader_hdf5 and slab extensions (cont'd) (#1572) * Merge changes from distconv * Add hdf5_data_reader * Merge changes in distconv * Enable slab-based reading when data_reader_hdf5 is used * Get the max mini-batch size via the trainer in Distconv adapters * Pass the maximum mini-batch size to models explicitly * Instantiate base_convolution_adapter * Fixed the function calls that set the owner map. Corrected a comment that caused merge conflict. * Fixed the copy_member function and restore from save functions to properly manage new fields for partitioned entries in the data store. Co-authored-by: Naoya Maruyama Co-authored-by: Brian C.
Van Essen --- include/lbann/data_readers/CMakeLists.txt | 5 + .../lbann/data_readers/data_reader_hdf5.hpp | 84 +++++ .../lbann/data_store/data_store_conduit.hpp | 135 ++++--- .../layers/data_type_distconv_adapter.hpp | 2 + include/lbann/lbann.hpp | 3 + include/lbann/models/model.hpp | 11 + src/data_readers/CMakeLists.txt | 5 + src/data_readers/data_reader_hdf5.cpp | 335 ++++++++++++++++++ src/data_store/data_store_conduit.cpp | 126 +++++-- src/io/data_buffers/partitioned_io_buffer.cpp | 30 ++ src/layers/data_type_distconv_adapter.cpp | 27 +- src/layers/io/input/input_layer.cpp | 15 + src/layers/learning/base_convolution.cpp | 8 + src/models/model.cpp | 1 + src/proto/proto_common.cpp | 11 + 15 files changed, 705 insertions(+), 93 deletions(-) create mode 100644 include/lbann/data_readers/data_reader_hdf5.hpp create mode 100644 src/data_readers/data_reader_hdf5.cpp diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index 7d56b6ebf46..f22597a6d9d 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -19,5 +19,10 @@ set_full_path(THIS_DIR_HEADERS data_reader_smiles.hpp ) +if (LBANN_HAS_DISTCONV) + list(APPEND THIS_DIR_HEADERS + "${CMAKE_CURRENT_SOURCE_DIR}/data_reader_hdf5.hpp") +endif () + # Propagate the files up the tree set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_readers/data_reader_hdf5.hpp b/include/lbann/data_readers/data_reader_hdf5.hpp new file mode 100644 index 00000000000..e01de13ac7c --- /dev/null +++ b/include/lbann/data_readers/data_reader_hdf5.hpp @@ -0,0 +1,84 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License.
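+//
+// Note (inferred from the implementation in data_reader_hdf5.cpp, not
+// stated elsewhere in this patch): this reader appears to target
+// CosmoFlow-style HDF5 data, where each file holds one 4-D int16 sample
+// (the "full" dataset, read as H5T_NATIVE_SHORT) plus a small float
+// response vector (the "unitPar" dataset); see HDF5_KEY_DATA and
+// HDF5_KEY_RESPONSES below in that file.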
+// +//////////////////////////////////////////////////////////////////////////////// +#ifndef LBANN_DATA_READER_HDF5_HPP +#define LBANN_DATA_READER_HDF5_HPP +#include "data_reader_image.hpp" +#include "hdf5.h" +#include "conduit/conduit.hpp" + +namespace lbann { +/** + * Data reader for data stored in HDF5 files. Each file is assumed to + * hold a single sample. + */ +class hdf5_reader : public generic_data_reader { + public: + hdf5_reader(const bool shuffle); + hdf5_reader(const hdf5_reader&); + hdf5_reader& operator=(const hdf5_reader&); + ~hdf5_reader() override {} + + hdf5_reader* copy() const override { return new hdf5_reader(*this); } + + void copy_members(const hdf5_reader& rhs); + + std::string get_type() const override { + return "data_reader_hdf5_images"; + } + //void set_input_params(int width, int height, int depth, int num_ch, int num_labels); + void load() override; + void set_hdf5_paths(const std::vector hdf5_paths) {m_file_paths = hdf5_paths;} + + int get_num_responses() const override { + return get_linearized_response_size(); + } + int get_linearized_data_size() const override { + return m_num_features; + } + int get_linearized_response_size() const override { + return m_num_response_features; + } + const std::vector get_data_dims() const override { + return m_data_dims; + } + protected: + void read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, int rank, + short *sample); + void read_hdf5_sample(int data_id, short *sample); + //void set_defaults() override; + bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; + void fetch_datum_conduit(Mat& X, int data_id); + bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; + void gather_responses(float *responses); + /// Whether to fetch a label from the last column. + bool m_has_labels = false; + /// Whether to fetch a response from the last column.
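+  /// (in this reader the response does not actually come from a column:
+  /// it is read from the "unitPar" HDF5 dataset as
+  /// m_num_response_features floats; see read_hdf5_sample() in
+  /// data_reader_hdf5.cpp)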
+ bool m_has_responses = true; + int m_image_depth=0; + size_t m_num_features; + static constexpr int m_num_response_features = 4; + float m_all_responses[m_num_response_features]; + std::vector m_file_paths; + MPI_Comm m_comm; + std::vector m_data_dims; + std::vector m_hyperslab_dims; + hid_t m_fapl; + hid_t m_dxpl; + MPI_Comm m_response_gather_comm; + bool m_use_data_store; + private: + static const std::string HDF5_KEY_DATA, HDF5_KEY_LABELS, HDF5_KEY_RESPONSES; +}; +} +#endif // LBANN_DATA_READER_HDF5_HPP diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index df0f1ced1d4..65716e06965 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -48,6 +48,16 @@ namespace lbann { class generic_data_reader; +/** Create a hash function for hashing a std::pair type */ +struct size_t_pair_hash +{ + template + std::size_t operator() (const std::pair &pair) const + { + return std::hash()(pair.first) ^ std::hash()(pair.second); + } +}; + class data_store_conduit { public: @@ -56,6 +66,9 @@ class data_store_conduit { using map_ii_t = std::unordered_map; using map_is_t = std::unordered_map; + // Hash map for tracking the node and hyperslab partition ID + using map_pssi_t = std::unordered_map,int,size_t_pair_hash>; + // not currently used; will be in the future using map_ss_t = std::unordered_map; @@ -113,13 +126,13 @@ class data_store_conduit { //================================================================= // methods for setting and querying the data store's mode //================================================================= - /** @brief Returns true if preloading is turned on + /** @brief Returns true if preloading is turned on * * See notes in: is_explicitly_loading() */ bool is_preloading() const { return m_preloading; } - /** @brief Returns true if explicitly loading is turned on + /** @brief Returns true if explicitly loading is turned on * * 'explicitly loading' means that the data that will be owned * by each rank is passed into the data store during the first epoch. @@ -130,7 +143,7 @@ class data_store_conduit { */ bool is_explicitly_loading() const { return m_explicitly_loading; } - /** @brief Returns true if all loading has been completed + /** @brief Returns true if all loading has been completed * * See notes in: set_loading_is_complete() */ @@ -143,35 +156,34 @@ class data_store_conduit { * but part of the set may be spilled to disk if memory is * insufficient. Local cache mode is activated via the cmd line * flag: --data_store_cache - */ + */ bool is_local_cache() const { return m_is_local_cache; } - /** @brief Turn preloading on or off */ + /** @brief Turn preloading on or off */ void set_is_preloading(bool flag); - /** @brief Turn on explicit loading */ + /** @brief Turn on explicit loading */ void set_is_explicitly_loading(bool flag); /** @brief Marks the data_store as fully loaded * * Fully loaded means that each rank has all the data that it * is intended to own. When not running in local cache mode, this - * occurs (1) at the conclusion of preloading, prior to the beginning of - * the first epoch, or (2) at the conclusion of the first epoch, if - * explicitly loading. When running in local cache mode, this occurs - * (1) at the conclusion of preload_local_cache(), which is called prior + * occurs (1) at the conclusion of preloading, prior to the beginning of + * the first epoch, or (2) at the conclusion of the first epoch, if + * explicitly loading. 
When running in local cache mode, this occurs + * (1) at the conclusion of preload_local_cache(), which is called prior + * to the first epoch, or (2) at the conclusion of exchange_local_caches(), * at the conclusion of the first epoch, if explicitly loading. */ - void set_loading_is_complete(); - + void set_loading_is_complete(); /** @brief turns local cache mode on or off */ void set_is_local_cache(bool flag = true) { m_is_local_cache = flag; } /** @brief Check that explicit loading, preloading, and fully loaded flags are consistent */ void check_query_flags() const; - + //================================================================= // END methods for setting and querying the data store's mode //================================================================= @@ -184,15 +196,23 @@ class data_store_conduit { void build_preloaded_owner_map(const std::vector& per_rank_list_sizes); /// fills in m_owner, which maps index -> owning processor - void set_preloaded_owner_map(const std::unordered_map &owner) { m_owner = owner; } + void set_preloaded_owner_map(const std::unordered_map &owner) { + for(auto&& i : owner) { + m_owner[std::make_pair(i.first, m_offset_in_partition)] = i.second; + } + } /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */ void clear_owner_map(); - void set_owner_map(const std::unordered_map &m) { m_owner = m; } + void set_owner_map(const std::unordered_map &m) { + for(auto&& i : m) { + m_owner[std::make_pair(i.first, m_offset_in_partition)] = i.second; + } + } /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */ - void add_owner(int data_id, int owner) { m_owner[data_id] = owner; } + void add_owner(int data_id, int owner) { m_owner[std::make_pair(data_id, m_offset_in_partition)] = owner; } /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */ void set_finished_building_map() { m_owner_maps_were_exchanged = true; } @@ -213,7 +233,7 @@ class data_store_conduit { */ void preload_local_cache(); - void exchange_mini_batch_data(size_t current_pos, size_t mb_size); + void exchange_mini_batch_data(size_t current_pos, size_t mb_size); void set_node_sizes_vary() { m_node_sizes_vary = true; } @@ -235,17 +255,17 @@ class data_store_conduit { * * Debug logging is enabled on all ranks via the cmd line flag: --data_store_debug */ - void flush_debug_file(); + void flush_debug_file(); /** @brief Closes then reopens the profile logging file * * Profile logging is enabled on P_0 via the cmd line flag: --data_store_profile */ - void flush_profile_file() const; + void flush_profile_file() const; /** @brief Writes object's state to file */ void write_checkpoint(std::string dir_name); - + /** @brief Loads object's state from file */ void load_checkpoint(std::string dir_name, generic_data_reader *reader = nullptr); @@ -260,7 +280,7 @@ class data_store_conduit { * @param n is the maximum number of samples to test; set to -1 to test all * @return true, if all samples read from file match those constructed from * the local shared memory segment (aka, cache) - */ + */ bool test_local_cache_imagenet(int n); void test_imagenet_node(int sample_id, bool dereference = true); @@ -298,7 +318,7 @@ private : /** @brief Used to form the directory path for spilling conduit nodes */ int m_cur_spill_dir_integer = -1; - /** @brief @brief Current directory for spilling (writing to file) conduit nodes + /** @brief Current directory for spilling (writing to file) conduit nodes * *
m_cur_spill_dir = m_spill_dir_base/ */ @@ -360,14 +380,14 @@ private : double m_rebuild_time = 0; // total time for exchange_mini_batch_data - double m_exchange_time = 0; + double m_exchange_time = 0; - // sanity check: + // sanity check: // m_start_snd_rcv_time + m_wait_all_time + m_rebuild_time // should be only slightly less than m_exchange_time; // Note that, for imagenet, the first call to exchange_data_by_sample // involves additional communication for exchanging sample sizes - + //=========================================================== // END: timers for profiling exchange_data //=========================================================== @@ -380,7 +400,7 @@ private : /** @brief True, if we are in preload mode */ bool m_preloading = false; - /** @brief True, if we are in explicit loading mode + /** @brief True, if we are in explicit loading mode * * There is some redundancy here: m_preloading and m_explicitly_loading * can not both be true, but both may be false. When m_loading_is_complete @@ -412,22 +432,35 @@ private : bool m_world_master; bool m_trainer_master; int m_rank_in_trainer; - int m_rank_in_world = -1; // -1 for debugging + int m_rank_in_world = -1; // -1 for debugging + int m_partition_in_trainer; + int m_offset_in_partition; + + /// number of procs in the trainer; convenience handle int m_np_in_trainer; + int m_num_partitions_in_trainer; - /** @brief Maps an index to the processor that owns the associated data */ - map_ii_t m_owner; + /** @brief Maps an index to the processor that owns the associated data + * First value of index is the sample ID and second value is the partition ID + * + * Must be mutable since rhs.m_owner may be modified in copy_members, + * in which rhs is const. + */ + mutable map_pssi_t m_owner; /// convenience handle const std::vector *m_shuffled_indices; /** @brief Contains the conduit nodes that are "owned" by this rank * - * Maps data_id -> conduit::Node. - */ - std::unordered_map m_data; + * Maps data_id -> conduit::Node. + * Must be mutable since rhs.m_data may be modified in copy_members, + * in which rhs is const.
+ */ + mutable std::unordered_map m_data; - /** @brief Contains the conduit nodes that are "owned" by this rank + /** @brief Contains a cache of the conduit nodes that are + * "owned" by this rank * * This differs from m_data in that this holds temporarily, * during the first epoch, if we're running in local cache mode @@ -452,11 +485,11 @@ private : std::vector m_outgoing_msg_sizes; std::vector m_incoming_msg_sizes; - /** @brief Maps a data_id to its image size + /** @brief Maps a data_id to its image size * * Used when conduit Nodes have non-uniform size, e.g., imagenet; * see: set_node_sizes_vary() - */ + */ map_is_t m_sample_sizes; /** @brief Maps a data_id to the image location in a shared memory segment */ @@ -472,7 +505,7 @@ private : std::vector> m_indices_to_recv; //========================================================================= - // methods follow + // methods follow //========================================================================= void exchange_data_by_sample(size_t current_pos, size_t mb_size); @@ -512,29 +545,27 @@ private : void compute_image_offsets(map_is_t &image_sizes, std::vector> &indices); /// for use in local cache mode - void exchange_images(std::vector &work, map_is_t &image_sizes, std::vector> &indices); + void exchange_images(std::vector &work, map_is_t &image_sizes, std::vector> &indices); - /// for use in local cache mode void build_conduit_nodes(map_is_t &sizes); - /// for use in local cache mode void fillin_shared_images(char* images, size_t size, size_t offset); /** @brief For testing during development * - * At the beginning of the 2nd epoch, calls write_checkpoint(), - * clears some variables, calls load_checkpoint then continues. + * At the beginning of the 2nd epoch, calls write_checkpoint(), + * clears some variables, calls load_checkpoint then continues. * To activate this test use cmd flag: --data_store_test_checkpoint= - */ + */ void test_checkpoint(const std::string&); /** @brief Called by test_checkpoint */ void print_variables(); - /** @brief Called by test_checkpoint + /** @brief Called by test_checkpoint * - * For testing and development. Prints the first 'n' entries from + * For testing and development. Prints the first 'n' entries from * the owner map * (which maps sample_id -> owning rank) to std::cout */ void print_partial_owner_map(int n); @@ -544,7 +575,7 @@ private : std::string get_metadata_fn() const; /** @brief Creates the directory if it does not already exist */ - void make_dir_if_it_doesnt_exist(const std::string &dir); + void make_dir_if_it_doesnt_exist(const std::string &dir); /** @brief Writes conduit node to file */ void spill_conduit_node(const conduit::Node &node, int data_id); @@ -554,8 +585,8 @@ private : /** @brief Creates directory structure, opens metadata file for output, etc * - * This method is called for both --data_store_spill and - * --data_store_test_checkpoint + * This method is called for both --data_store_spill and + * --data_store_test_checkpoint */ void setup_spill(std::string dir); @@ -572,7 +603,7 @@ private : * files are opened if the cmd flag --data_store_debug is passed. * A profiling file is opened only by * pairs; files are opened if the cmd flag --data_store_profile is passed.
- */ + */ void open_informational_files(); /** @brief Creates a directory for spilling conduit nodes */ @@ -591,11 +622,11 @@ private : // functions and templates for optional profiling and debug files follow //========================================================================= - void PROFILE() const { + void PROFILE() const { if (!m_profile) { return; } - (*m_profile) << std::endl; + (*m_profile) << std::endl; flush_profile_file(); } @@ -612,11 +643,11 @@ private : flush_profile_file(); } - void DEBUG_DS() { + void DEBUG_DS() { if (!m_debug) { return; } - (*m_debug) << std::endl; + (*m_debug) << std::endl; flush_debug_file(); } diff --git a/include/lbann/layers/data_type_distconv_adapter.hpp b/include/lbann/layers/data_type_distconv_adapter.hpp index a120965ad67..f483bf9f0c6 100644 --- a/include/lbann/layers/data_type_distconv_adapter.hpp +++ b/include/lbann/layers/data_type_distconv_adapter.hpp @@ -156,6 +156,8 @@ class data_type_distconv_adapter: public distconv_adapter { void set_activations_outermost_dimension(size_t dim); void set_error_signals_outermost_dimension(size_t dim); + + size_t get_max_mini_batch_size() const; }; } // namespace lbann diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index a9dbdf2c553..1496a7211d3 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -136,6 +136,9 @@ #include "lbann/data_readers/data_reader_pilot2_molecular.hpp" #include "lbann/data_readers/data_reader_mesh.hpp" #include "lbann/data_readers/data_reader_python.hpp" +#ifdef LBANN_HAS_DISTCONV +#include "lbann/data_readers/data_reader_hdf5.hpp" +#endif // LBANN_HAS_DISTCONV #include "lbann/data_readers/data_reader_smiles.hpp" /// Data stores diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index de30914d0c8..10cfa4896c1 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -415,6 +415,11 @@ class model { /** @brief Execute callbacks at the end of weight optimization. */ virtual void do_weight_optimize_end_cbs(weights *w); +#ifdef LBANN_HAS_DISTCONV + /* @brief Return the maximum mini-batch size used by Distconv. */ + size_t get_max_mini_batch_size_distconv() const { return m_max_mini_batch_size_distconv; } +#endif + private: /** Pointer to the execution context object used for training or evaluating this model */ @@ -504,6 +509,12 @@ class model { void setup_distconv(); void setup_distributions(); void print_distributions() const; + + /** @brief The maximum mini-batch size used by Distconv. + * @details This should be set before setup_distconv() is called. + */ + size_t m_max_mini_batch_size_distconv; + #endif // LBANN_HAS_DISTCONV }; diff --git a/src/data_readers/CMakeLists.txt b/src/data_readers/CMakeLists.txt index 780d74e0b1a..2b27750695e 100644 --- a/src/data_readers/CMakeLists.txt +++ b/src/data_readers/CMakeLists.txt @@ -21,5 +21,10 @@ set_full_path(THIS_DIR_SOURCES data_reader_smiles.cpp ) +if (LBANN_HAS_DISTCONV) + list(APPEND THIS_DIR_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/data_reader_hdf5.cpp") +endif () + # Propagate the files up the tree set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/data_readers/data_reader_hdf5.cpp b/src/data_readers/data_reader_hdf5.cpp new file mode 100644 index 00000000000..722b1b79e42 --- /dev/null +++ b/src/data_readers/data_reader_hdf5.cpp @@ -0,0 +1,335 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
+// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +// +///////////////////////////////////////////////////////////////////////////////// +#include "lbann/data_readers/data_reader_hdf5.hpp" +#include "lbann/utils/profiling.hpp" +#include "lbann/utils/distconv.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" + +#include +#include +#include +#include +#include +#include + +namespace { +inline hid_t check_hdf5(hid_t hid, const char *file, int line) { + if (hid < 0) { + std::cerr << "HDF5 error" << std::endl; + std::cerr << "Error at " << file << ":" << line << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + return hid; +} +} // namespace + +#define CHECK_HDF5(call) check_hdf5(call, __FILE__, __LINE__) + +namespace lbann { + +const std::string hdf5_reader::HDF5_KEY_DATA = "full"; +const std::string hdf5_reader::HDF5_KEY_RESPONSES = "unitPar"; + +hdf5_reader::hdf5_reader(const bool shuffle) + : generic_data_reader(shuffle), + m_use_data_store(options::get()->get_bool("use_data_store")) { +} + +hdf5_reader::hdf5_reader(const hdf5_reader& rhs) : generic_data_reader(rhs) { + copy_members(rhs); +} + +hdf5_reader& hdf5_reader::operator=(const hdf5_reader& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + generic_data_reader::operator=(rhs); + copy_members(rhs); + return (*this); +} + +void hdf5_reader::copy_members(const hdf5_reader &rhs) { + if(rhs.m_data_store != nullptr) { + m_data_store = new data_store_conduit(rhs.get_data_store()); + // only dereference inside the guard: m_data_store is null when rhs has no data store + m_data_store->set_data_reader_ptr(this); + } + + m_has_labels = rhs.m_has_labels; + m_has_responses = rhs.m_has_responses; + m_num_features = rhs.m_num_features; + m_data_dims = rhs.m_data_dims; + m_hyperslab_dims = rhs.m_hyperslab_dims; + m_comm = rhs.m_comm; + m_file_paths = rhs.m_file_paths; + m_use_data_store = rhs.m_use_data_store; + + for(size_t i = 0; i < m_num_response_features; i++) { + m_all_responses[i] = rhs.m_all_responses[i]; + } +} + +void hdf5_reader::read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, + int rank, short *sample) { + prof_region_begin("read_hdf5_hyperslab", prof_colors[0], false); + // this determines the split; right now it is hard-coded to split along the + // z axis + int num_io_parts = dc::get_number_of_io_partitions(); + + // how many times the pattern should repeat in the hyperslab + hsize_t count[4] = {1,1,1,1}; + + // necessary for the hdf5 lib + hid_t memspace = H5Screate_simple(4, m_hyperslab_dims.data(), NULL); + int spatial_offset = rank % num_io_parts; + hsize_t offset[4] = {0, m_hyperslab_dims[1] * spatial_offset, 0, 0}; + + // from an explanation of the hdf5
select_hyperslab: + // start -> a starting location for the hyperslab + // stride -> the number of elements to separate each element or block to be selected + // count -> the number of elements or blocks to select along each dimension + // block -> the size of the block selected from the dataspace + //hsize_t status; + + CHECK_HDF5(H5Sselect_hyperslab(filespace, H5S_SELECT_SET, + offset, NULL, count, + m_hyperslab_dims.data())); + + CHECK_HDF5(H5Dread(h_data, H5T_NATIVE_SHORT, memspace, + filespace, m_dxpl, sample)); + prof_region_end("read_hdf5_hyperslab", false); +} + +void hdf5_reader::read_hdf5_sample(int data_id, short *sample) { + int world_rank = get_comm()->get_rank_in_trainer(); + auto file = m_file_paths[data_id]; + hid_t h_file = CHECK_HDF5(H5Fopen(file.c_str(), H5F_ACC_RDONLY, m_fapl)); + + // load in dataset + hid_t h_data = CHECK_HDF5( + H5Dopen(h_file, HDF5_KEY_DATA.c_str(), H5P_DEFAULT)); + hid_t filespace = CHECK_HDF5(H5Dget_space(h_data)); + // get the number of dimensions from the dataset + int rank1 = H5Sget_simple_extent_ndims(filespace); + hsize_t dims[rank1]; + // read in what the dimensions are + CHECK_HDF5(H5Sget_simple_extent_dims(filespace, dims, NULL)); + + read_hdf5_hyperslab(h_data, filespace, world_rank, sample); + //close data set + CHECK_HDF5(H5Dclose(h_data)); + + if (m_has_responses) { + h_data = CHECK_HDF5(H5Dopen(h_file, HDF5_KEY_RESPONSES.c_str(), H5P_DEFAULT)); + CHECK_HDF5(H5Dread(h_data, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, m_all_responses)); + CHECK_HDF5(H5Dclose(h_data)); + } + CHECK_HDF5(H5Fclose(h_file)); + return; +} + +void hdf5_reader::load() { + lbann_comm* l_comm = get_comm(); + MPI_Comm mpi_comm = l_comm->get_trainer_comm().GetMPIComm(); + int world_rank = l_comm->get_rank_in_trainer(); + int color = world_rank / dc::get_number_of_io_partitions(); + MPI_Comm_split(mpi_comm, color, world_rank, &m_comm); + m_shuffled_indices.clear(); + m_shuffled_indices.resize(m_file_paths.size()); + std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if ((nprocs % dc::get_number_of_io_partitions()) !=0) { + LBANN_ERROR("nprocs should be divisible by num of io partitions otherwise this won't work"); + } + + // Read the dimension size of the first sample, + // assuming that all of the samples have the same dimension size + if (m_file_paths.size() > 0) { + const hid_t h_file = CHECK_HDF5(H5Fopen(m_file_paths[0].c_str(), + H5F_ACC_RDONLY, H5P_DEFAULT)); + const hid_t h_data = CHECK_HDF5(H5Dopen(h_file, HDF5_KEY_DATA.c_str(), + H5P_DEFAULT)); + const hid_t h_space = CHECK_HDF5(H5Dget_space(h_data)); + if (CHECK_HDF5(H5Sget_simple_extent_ndims(h_space)) != 4) { + LBANN_ERROR("The number of dimensions of HDF5 data samples should be 4"); + } + hsize_t dims[4]; + CHECK_HDF5(H5Sget_simple_extent_dims(h_space, dims, NULL)); + CHECK_HDF5(H5Dclose(h_data)); + m_data_dims = std::vector(dims, dims+4); + } else { + LBANN_ERROR("The number of HDF5 samples should not be zero"); + } + + m_num_features = std::accumulate(m_data_dims.begin(), + m_data_dims.end(), + (size_t) 1, + std::multiplies()); + + + for (auto i: m_data_dims) { + m_hyperslab_dims.push_back(i); + } + // Partition the z dimension + m_hyperslab_dims[1] /= dc::get_number_of_io_partitions(); + +#define DATA_READER_HDF5_USE_MPI_IO +#ifdef DATA_READER_HDF5_USE_MPI_IO + m_fapl = CHECK_HDF5(H5Pcreate(H5P_FILE_ACCESS)); + CHECK_HDF5(H5Pset_fapl_mpio(m_fapl, m_comm, MPI_INFO_NULL)); + m_dxpl = CHECK_HDF5(H5Pcreate(H5P_DATASET_XFER));
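+  // The file-access property list (m_fapl) above lets every rank in m_comm
+  // open the same file through MPI-IO; the dataset-transfer property list
+  // (m_dxpl) below selects how each rank moves its hyperslab (independent
+  // per-rank transfers here, with collective I/O as the alternative).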
+ CHECK_HDF5(H5Pset_dxpl_mpio(m_dxpl, H5FD_MPIO_INDEPENDENT)); // H5FD_MPIO_COLLECTIVE +#else + m_fapl = H5P_DEFAULT; + m_dxpl = H5P_DEFAULT; +#endif + std::vector local_list_sizes; + options *opts = options::get(); + if (opts->get_bool("preload_data_store")) { + LBANN_ERROR("preload_data_store not supported on HDF5 data reader"); + } + if (m_use_data_store) { + instantiate_data_store(); + } + + select_subset_of_data(); + MPI_Comm_dup(dc::get_mpi_comm(), &m_response_gather_comm); +} + +bool hdf5_reader::fetch_label(Mat& Y, int data_id, int mb_idx) { + return true; +} + +bool hdf5_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { + prof_region_begin("fetch_datum", prof_colors[0], false); + + // In the Cosmoflow case, each minibatch should have only one + // sample per rank. + assert_eq(X.Width(), 1); + assert_eq(X.Height(), + m_num_features / dc::get_number_of_io_partitions() + / (sizeof(DataType) / sizeof(short))); + + if (m_use_data_store) { + fetch_datum_conduit(X, data_id); + } else { + read_hdf5_sample(data_id, (short*)X.Buffer()); + } + prof_region_end("fetch_datum", false); + return true; +} + +void hdf5_reader::fetch_datum_conduit(Mat& X, int data_id) { + const std::string conduit_key = LBANN_DATA_ID_STR(data_id); + // Create a node to hold all of the data + conduit::Node node; + if (data_store_active()) { + prof_region_begin("get_conduit_node", prof_colors[0], false); + const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); + node.set_external(ds_node); + prof_region_end("get_conduit_node", false); + } else { + auto &conduit_obj = node[conduit_key + "/slab"]; + conduit_obj.set(conduit::DataType::int16( + m_num_features / dc::get_number_of_io_partitions())); + short *sample_buf = conduit_obj.value(); + read_hdf5_sample(data_id, sample_buf); + node[conduit_key + "/responses"].set(m_all_responses, 4); + if (priming_data_store()) { + // Once the node has been populated save it in the data store + m_data_store->set_conduit_node(data_id, node); + } + } + prof_region_begin("set_external", prof_colors[0], false); + conduit::Node slab; + slab.set_external(node[conduit_key + "/slab"]); + prof_region_end("set_external", false); + short *data = slab.value(); + prof_region_begin("copy_to_buffer", prof_colors[0], false); + std::memcpy(X.Buffer(), data, slab.dtype().number_of_elements()*slab.dtype().element_bytes()); + prof_region_end("copy_to_buffer", false); +} + +//get from a cached response +bool hdf5_reader::fetch_response(Mat& Y, int data_id, int mb_idx) { + prof_region_begin("fetch_response", prof_colors[0], false); + assert_eq(Y.Height(), m_num_response_features); + float *buf = nullptr; + if (data_store_active()) { + conduit::Node node; + const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); + node.set_external(ds_node); + const std::string conduit_obj = LBANN_DATA_ID_STR(data_id); + buf = node[conduit_obj+"/responses"].value(); + }else { + buf = m_all_responses; + } + std::memcpy(Y.Buffer(), buf, + m_num_response_features*sizeof(DataType)); + gather_responses(Y.Buffer()); + prof_region_end("fetch_response", false); + return true; +} + +// Gather scattered responses to the first N ranks, where N is the +// mini-batch size. This is not necessary when the rank reordering +// is used. 
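+// For example, with 4 I/O partitions, ranks 0-3 cooperate on sample 0 and
+// rank 4 reads the responses for sample 1; rank 4 (rank % num_part == 0)
+// sends them to rank 4/4 = 1, and rank 1 (being < mini_batch_size)
+// receives from rank 1*4 = 4, matching src_rank/dst_rank below.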
+void hdf5_reader::gather_responses(float *responses) { + float recv_buf[m_num_response_features]; + const int rank = dc::get_mpi_rank(); + const int num_part = dc::get_number_of_io_partitions(); + const int mini_batch_size = this->get_loaded_mini_batch_size(); + const int src_rank = rank * num_part; + const int dst_rank = rank / num_part; + const int tag = 0; + int req_idx = 0; + MPI_Request req[2]; + + // send + if (rank % num_part == 0) { + MPI_Isend(responses, m_num_response_features, MPI_FLOAT, dst_rank, + tag, m_response_gather_comm, &req[req_idx]); + ++req_idx; + } + + // recv + if (rank < mini_batch_size) { + MPI_Irecv(recv_buf, m_num_response_features, MPI_FLOAT, src_rank, tag, + m_response_gather_comm, &req[req_idx]); + ++req_idx; + } + + if (req_idx > 0) { + MPI_Waitall(req_idx, req, MPI_STATUS_IGNORE); + } + + std::memcpy(responses, recv_buf, sizeof(float) * m_num_response_features); +} + +} // namespace lbann diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index ef38975d8b9..57a4e0222f9 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -32,6 +32,7 @@ #include "lbann/utils/exception.hpp" #include "lbann/utils/options.hpp" #include "lbann/utils/timer.hpp" +#include "lbann/utils/distconv.hpp" #include "lbann/utils/file_utils.hpp" #include "lbann/utils/commify.hpp" #include @@ -61,11 +62,20 @@ data_store_conduit::data_store_conduit( LBANN_ERROR("m_comm is nullptr"); } +#ifdef LBANN_HAS_DISTCONV + int num_io_parts = dc::get_number_of_io_partitions(); +#else + int num_io_parts = 1; +#endif // LBANN_HAS_DISTCONV + m_world_master = m_comm->am_world_master(); m_trainer_master = m_comm->am_trainer_master(); m_rank_in_trainer = m_comm->get_rank_in_trainer(); m_rank_in_world = m_comm->get_rank_in_world(); + m_partition_in_trainer = m_rank_in_trainer/num_io_parts; // needs a better name which group you are in + m_offset_in_partition = m_rank_in_trainer%num_io_parts; m_np_in_trainer = m_comm->get_procs_per_trainer(); + m_num_partitions_in_trainer = m_np_in_trainer/num_io_parts; // rename this m_num_io_groups_in_trainer open_informational_files(); @@ -183,7 +193,11 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs) { m_world_master = rhs.m_world_master; m_trainer_master = rhs.m_trainer_master; m_rank_in_trainer = rhs.m_rank_in_trainer; + m_rank_in_world = rhs.m_rank_in_world; + m_partition_in_trainer = rhs.m_partition_in_trainer; + m_offset_in_partition = rhs.m_offset_in_partition; m_np_in_trainer = rhs.m_np_in_trainer; + m_num_partitions_in_trainer = rhs.m_num_partitions_in_trainer; m_owner = rhs.m_owner; m_shuffled_indices = rhs.m_shuffled_indices; m_sample_sizes = rhs.m_sample_sizes; @@ -380,7 +394,8 @@ void data_store_conduit::set_conduit_node(int data_id, const conduit::Node &node { // std::lock_guard lock(m_mutex); LBANN_ERROR("NOT YET IMPLEMENTED"); - m_owner[data_id] = m_rank_in_trainer; + auto key = std::make_pair(data_id, m_offset_in_partition); + m_owner[key] = m_rank_in_trainer; m_sample_sizes[data_id] = n2.total_bytes_compact(); spill_conduit_node(node, data_id); m_spilled_nodes[data_id] = m_cur_spill_dir_integer; @@ -388,13 +403,14 @@ void data_store_conduit::set_conduit_node(int data_id, const conduit::Node &node } else { - { - // std::lock_guard lock(m_mutex); - m_owner[data_id] = m_rank_in_trainer; - build_node_for_sending(node, m_data[data_id]); - m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); - } + // m_mutex.lock(); + DEBUG_DS("set_conduit_node 
: rank_in_trainer=", m_rank_in_trainer, " and partition_in_trainer=", m_partition_in_trainer, " offset in partition=", m_offset_in_partition, " with num_partitions=", m_num_partitions_in_trainer); + auto key = std::make_pair(data_id, m_offset_in_partition); + m_owner[key] = m_rank_in_trainer; + build_node_for_sending(node, m_data[data_id]); + m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); error_check_compacted_node(m_data[data_id], data_id); + // m_mutex.unlock(); } } } @@ -616,8 +632,14 @@ int data_store_conduit::build_indices_i_will_recv(int current_pos, int mb_size) int k = 0; for (int i=current_pos; i< current_pos + mb_size; ++i) { auto index = (*m_shuffled_indices)[i]; - if ((i % m_owner_map_mb_size) % m_np_in_trainer == m_rank_in_trainer) { - int owner = m_owner[index]; +#ifdef LBANN_HAS_DISTCONV + int num_ranks_in_partition = dc::get_number_of_io_partitions(); +#else + int num_ranks_in_partition = 1; +#endif // LBANN_HAS_DISTCONV + if ((((i % m_owner_map_mb_size) % m_num_partitions_in_trainer) * num_ranks_in_partition + m_offset_in_partition) == m_rank_in_trainer) { + auto key = std::make_pair(index, m_offset_in_partition); + int owner = m_owner[key]; m_indices_to_recv[owner].insert(index); k++; } @@ -640,11 +662,17 @@ int data_store_conduit::build_indices_i_will_send(int current_pos, int mb_size) is_mine = true; } if (is_mine) { - m_indices_to_send[(i % m_owner_map_mb_size) % m_np_in_trainer].insert(index); +#ifdef LBANN_HAS_DISTCONV + int num_ranks_in_partition = dc::get_number_of_io_partitions(); +#else + int num_ranks_in_partition = 1; +#endif // LBANN_HAS_DISTCONV + m_indices_to_send[(((i % m_owner_map_mb_size) % m_num_partitions_in_trainer) * num_ranks_in_partition + m_offset_in_partition)].insert(index); // Sanity check - if (m_owner[index] != m_rank_in_trainer) { - LBANN_ERROR( "error for i: ", i, " index: ", index, " m_owner: ", m_owner[index], " me: ", m_rank_in_trainer); + auto key = std::make_pair(index, m_offset_in_partition); + if (m_owner[key] != m_rank_in_trainer) { + LBANN_ERROR( "error for i: ", i, " index: ", index, " m_owner: ", m_owner[key], " me: ", m_rank_in_trainer); } k++; } @@ -663,7 +691,8 @@ void data_store_conduit::build_preloaded_owner_map(const std::vector& per_r ++owning_rank; per_rank_list_range_start += per_rank_list_size; } - m_owner[(*m_shuffled_indices)[i]] = owning_rank; + auto key = std::make_pair((*m_shuffled_indices)[i], m_offset_in_partition); + m_owner[key] = owning_rank; } PROFILE("build_preloaded_owner_map; m_owner_maps_were_exchanged = true"); m_owner_maps_were_exchanged = true; @@ -710,10 +739,11 @@ void data_store_conduit::compact_nodes() { } int data_store_conduit::get_index_owner(int idx) { - if (m_owner.find(idx) == m_owner.end()) { + auto key = std::make_pair(idx, m_offset_in_partition); + if (m_owner.find(key) == m_owner.end()) { LBANN_ERROR(" idx: ", idx, " was not found in the m_owner map; map size: ", m_owner.size()); } - return m_owner[idx]; + return m_owner[key]; } void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset) { @@ -1324,31 +1354,52 @@ void data_store_conduit::exchange_owner_maps() { m_comm->all_gather(&my_count, 1, all_counts.data(), 1, m_comm->get_trainer_comm()); std::vector my_sizes(m_my_num_indices); + std::vector> nodes_i_own(m_owner.size()); size_t j = 0; for (auto t : m_owner) { - my_sizes[j++] = t.first; + auto slab_id = std::make_pair(t.first.first, t.first.second); + nodes_i_own[j++] = slab_id; + DEBUG_DS("I am 
building the size vector from the owner map for ", t.first.first, ".", t.first.second, " and ", t.second); } - std::vector others; + std::vector> other_ranks_nodes; for (int k=0; kbroadcast(k, my_sizes.data(), all_counts[k], m_comm->get_trainer_comm()); + m_comm->broadcast>(k, nodes_i_own.data(), all_counts[k], m_comm->get_trainer_comm()); + if(m_debug) { + int c = 0; + for(auto i : nodes_i_own) { + DEBUG_DS("k=", k, ": nodes_i_own[", c, "]=", i.first, ".", i.second); + c++; + } + } } else { - m_comm->broadcast(k, others.data(), all_counts[k], m_comm->get_trainer_comm()); - for (size_t i=0; ibroadcast>(k, other_ranks_nodes.data(), all_counts[k], m_comm->get_trainer_comm()); + if(m_debug) { + int c = 0; + for(auto i : other_ranks_nodes) { + DEBUG_DS("k=", k, ": other_ranks_nodes[", c, "]=", i.first, ".", i.second); + c++; + } + } + for (size_t i=0; iget_role(), "; m_owner[", others[i],"] = ", m_owner[others[i]], " for role: ", m_reader->get_role(), " m_owner.size: ", m_owner.size(), " m_data.size(): ", m_data.size()); + LBANN_ERROR("duplicate data_id: ", other_ranks_nodes[i].first, ".", + other_ranks_nodes[i].second, " role: ", m_reader->get_role(), "; m_owner[",other_ranks_nodes[i].first, ".", other_ranks_nodes[i].second,"] = ", m_owner[key]); } - m_owner[others[i]] = k; + m_owner[key] = k; } } @@ -1533,9 +1584,9 @@ void data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) { //check that the owner map was correctly loaded for (auto t : m_owner) { if (sanity.find(t.first) == sanity.end()) { - LBANN_ERROR("sanity.find(t.first) == sanity.end() for t.first= ", t.first); + LBANN_ERROR("sanity.find(t.first) == sanity.end() for t.first= ", t.first.first, ":", t.first.second); } else if (sanity[t.first] != m_owner[t.first]) { - LBANN_ERROR("sanity[t.first] != m_owner[t.first] for t.first= ", t.first, " and m_owner[t.first]= ", m_owner[t.first]); + LBANN_ERROR("sanity[t.first] != m_owner[t.first] for t.first= ", t.first.first, ":", t.first.second, " and m_owner[t.first]= ", m_owner[t.first]); } } @@ -1663,6 +1714,12 @@ void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_read m_owner, m_sample_sizes); if (reader != nullptr) { +#ifdef LBANN_HAS_DISTCONV + int num_io_parts = dc::get_number_of_io_partitions(); +#else + int num_io_parts = 1; +#endif // LBANN_HAS_DISTCONV + m_reader = reader; m_comm = m_reader->get_comm(); m_shuffled_indices = &(m_reader->get_shuffled_indices()); @@ -1670,7 +1727,10 @@ void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_read m_trainer_master = m_comm->am_trainer_master(); m_rank_in_trainer = m_comm->get_rank_in_trainer(); m_rank_in_world = m_comm->get_rank_in_world(); + m_partition_in_trainer = m_rank_in_trainer/num_io_parts; // needs a better name which group you are in + m_offset_in_partition = m_rank_in_trainer%num_io_parts; m_np_in_trainer = m_comm->get_procs_per_trainer(); + m_num_partitions_in_trainer = m_np_in_trainer/num_io_parts; // rename this m_num_io_groups_in_trainer } // Open metadata filename; this is in index re, checkpointed conduit filenames @@ -1804,14 +1864,14 @@ void data_store_conduit::open_informational_files() { } void data_store_conduit::print_partial_owner_map(int n) { - std::cout << "\nHere is part of the owner map; m_owner.size(): " << m_owner.size() << std::endl; - std::map m; + std::cout << "\nHere is part of the owner map; m_owner.size(): " << m_owner.size() << std::endl; + std::map, int> m; for (auto t : m_owner) { m[t.first] = t.second; } int j = 0; for (auto t : m) { - 
std::cout << " sample_id: " << t.first << " owner: " << t.second << std::endl; + std::cout << " sample_id: " << t.first.first << ":" << t.first.second << " owner: " << t.second << std::endl; if (j++ >= 10) break; } } diff --git a/src/io/data_buffers/partitioned_io_buffer.cpp b/src/io/data_buffers/partitioned_io_buffer.cpp index 14495a8463a..8609adf4558 100644 --- a/src/io/data_buffers/partitioned_io_buffer.cpp +++ b/src/io/data_buffers/partitioned_io_buffer.cpp @@ -26,6 +26,8 @@ #include "lbann/io/data_buffers/partitioned_io_buffer.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/profiling.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { @@ -69,6 +71,9 @@ partitioned_io_buffer& partitioned_io_buffer::op template void partitioned_io_buffer::fp_setup_data(El::Int cur_mini_batch_size, int idx) { +#ifdef LBANN_HAS_DISTCONV + cur_mini_batch_size *= dc::get_number_of_io_partitions(); +#endif for (auto& buf : m_data_buffers) { buf.second->m_input_buffers[idx]->Resize(buf.second->m_input_buffers[idx]->Height(), cur_mini_batch_size); } @@ -76,8 +81,23 @@ void partitioned_io_buffer::fp_setup_data(El::Int cur_mini_batch template void partitioned_io_buffer::setup_data(El::Int num_neurons, El::Int num_targets, El::Int max_mini_batch_size) { +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + num_neurons /= dc::get_number_of_io_partitions(); + // TensorDataType is assumed to be 2-byte integer types such as + // short or int16_t. + assert_eq(sizeof(TensorDataType), sizeof(short)); + max_mini_batch_size *= dc::get_number_of_io_partitions(); + } +#endif // LBANN_HAS_DISTCONV El::Int local_mini_batch_size = max_mini_batch_size / this->m_comm->get_procs_per_trainer(); El::Int partial_mini_batch_size = max_mini_batch_size % this->m_comm->get_procs_per_trainer(); +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + assert_eq(local_mini_batch_size, 1); + assert_eq(partial_mini_batch_size, 0); + } +#endif // LBANN_HAS_DISTCONV if(partial_mini_batch_size > 0 && this->m_comm->get_rank_in_trainer() < partial_mini_batch_size) { local_mini_batch_size++; } @@ -104,6 +124,7 @@ template int partitioned_io_buffer::fetch_to_local_matrix(generic_data_reader *data_reader, execution_mode mode) { int num_parallel_readers = data_reader->get_num_parallel_readers(); + prof_region_begin("fetch_to_local_matrix", prof_colors[2], false); /// Coordinate all available readers so that the perform I/O in the same step /// Check to make sure that the local matrix has space for data data_buffer *buf = get_data_buffer(mode); @@ -121,15 +142,24 @@ int partitioned_io_buffer::fetch_to_local_matrix(generic_data_re // m_num_data_per_epoch+=num_samples_fetched; /// BVE FIXME need to change how this is shared } } + prof_region_end("fetch_to_local_matrix", false); return buf->m_num_samples_fetched; } template void partitioned_io_buffer::distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample, AbsDistMatrixType& response) { + prof_region_begin("distribute_from_local_matrix", prof_colors[3], false); data_buffer *buf = get_data_buffer(mode); Copy(*buf->m_input_buffers[0], sample); Copy(*buf->m_input_buffers[1], response); +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + response.Resize(response.Height(), response.Width() / + dc::get_number_of_io_partitions()); + } +#endif buf->m_num_samples_fetched = 0; + prof_region_end("distribute_from_local_matrix", false); return; } diff --git 
a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp index 56163f0bbcb..a5d20aeee1d 100644 --- a/src/layers/data_type_distconv_adapter.cpp +++ b/src/layers/data_type_distconv_adapter.cpp @@ -28,6 +28,7 @@ #include "lbann/layers/data_type_layer.hpp" #include "lbann/models/model.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" +#include "lbann/trainers/trainer.hpp" namespace lbann { @@ -345,7 +346,7 @@ dc::Shape data_type_distconv_adapter::get_prev_activations_shape int input_index) const { const auto input_dims = layer().get_input_dims(input_index); std::vector input_tensor_shape_v(input_dims.rbegin(), input_dims.rend()); - input_tensor_shape_v.push_back(layer().get_model()->get_max_mini_batch_size()); + input_tensor_shape_v.push_back(get_max_mini_batch_size()); return dc::Shape(input_tensor_shape_v); } @@ -361,7 +362,7 @@ dc::Shape data_type_distconv_adapter::get_activations_shape( int output_index) const { const auto output_dims = layer().get_output_dims(output_index); std::vector output_tensor_shape_v(output_dims.rbegin(), output_dims.rend()); - output_tensor_shape_v.push_back(layer().get_model()->get_max_mini_batch_size()); + output_tensor_shape_v.push_back(get_max_mini_batch_size()); return dc::Shape(output_tensor_shape_v); } @@ -700,12 +701,13 @@ dc::TensorShuffler &get_shuffler( const Layer &layer, std::array*, 4> &shufflers, const dc::TensorDev &src, - const dc::TensorDev &dst) { + const dc::TensorDev &dst, + const size_t max_mini_batch_size) { const auto& c = static_cast( layer.get_model()->get_execution_context()); const auto& mini_batch_size = c.get_current_mini_batch_size(); int shuffler_idx = -1; - if (layer.get_model()->get_max_mini_batch_size() == mini_batch_size) { + if (max_mini_batch_size == mini_batch_size) { shuffler_idx = 0; } else { // The last remaining mini-batches for the train, validation, and @@ -727,28 +729,32 @@ template dc::TensorShuffler& data_type_distconv_adapter:: get_prev_activations_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_prev_activations_shufflers, src, dst); + return get_shuffler(layer(), m_prev_activations_shufflers, src, dst, + get_max_mini_batch_size()); } template dc::TensorShuffler& data_type_distconv_adapter:: get_activations_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_activations_shufflers, src, dst); + return get_shuffler(layer(), m_activations_shufflers, src, dst, + get_max_mini_batch_size()); } template dc::TensorShuffler& data_type_distconv_adapter:: get_prev_error_signals_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_prev_error_signals_shufflers, src, dst); + return get_shuffler(layer(), m_prev_error_signals_shufflers, src, dst, + get_max_mini_batch_size()); } template dc::TensorShuffler& data_type_distconv_adapter:: get_error_signals_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_error_signals_shufflers, src, dst); + return get_shuffler(layer(), m_error_signals_shufflers, src, dst, + get_max_mini_batch_size()); } template @@ -878,6 +884,11 @@ void data_type_distconv_adapter::dump_original_error_signals() { get_name() + "_error_signals_original"); } +template +size_t data_type_distconv_adapter::get_max_mini_batch_size() const { + return layer().get_model()->get_max_mini_batch_size_distconv(); +} + #define PROTO(T) \ template class data_type_distconv_adapter diff --git 
a/src/layers/io/input/input_layer.cpp b/src/layers/io/input/input_layer.cpp index eca519118da..f2e3fe3bda5 100644 --- a/src/layers/io/input/input_layer.cpp +++ b/src/layers/io/input/input_layer.cpp @@ -27,6 +27,9 @@ #define LBANN_INPUT_LAYER_INSTANTIATE #include "lbann/layers/io/input/input_layer.hpp" #include "lbann/utils/profiling.hpp" +#ifdef LBANN_HAS_DISTCONV +#include "lbann/data_readers/data_reader_hdf5.hpp" +#endif // LBANN_HAS_DISTCONV namespace lbann { @@ -41,6 +44,18 @@ input_distconv_adapter(Layer& layer): data_type_distconv_adapter for (int i = 0; i < layer.get_num_children(); ++i) { m_is_input_processed.push_back(layer.get_child_layers()[i]->distconv_enabled()); } + auto &l = dynamic_cast&>(this->layer()); + // TODO: hdf5_reader is assumed to return a sub-sample partitioned + // in the same way as specified by the parallel strategy of this input + // layer. Other data readers are assumed to return a complete + // sample, thus shuffling is required (unless sample-parallel + // strategy is given). Conceptually, it seems to make sense if a + // data reader is annotated with a parallel strategy. Note that, + // when the HDF5 data reader is used, it is assumed that it is used + // in all execution modes. + auto training_dr = l.get_data_reader(execution_mode::training); + m_shuffle_required = dynamic_cast(training_dr) == nullptr; if (m_shuffle_required) { m_shufflers.resize(layer.get_num_children()); } diff --git a/src/layers/learning/base_convolution.cpp b/src/layers/learning/base_convolution.cpp index 151b385e0bf..3e98cf5017e 100644 --- a/src/layers/learning/base_convolution.cpp +++ b/src/layers/learning/base_convolution.cpp @@ -1386,6 +1386,14 @@ void base_convolution_adapter::bp_compute_convolution_fi m_kernel_gradient->scale(dst_scale, hydrogen::cuda::GetDefaultStream()); } } + + +#define PROTO_DEVICE(T, Device) \ + template class base_convolution_adapter + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + #endif // LBANN_HAS_DISTCONV #define PROTO_DEVICE(T, Device) \ diff --git a/src/models/model.cpp b/src/models/model.cpp index a61bfd13525..ce0ade8087e 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -594,6 +594,7 @@ void model::setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata) { } #ifdef LBANN_HAS_DISTCONV + m_max_mini_batch_size_distconv = max_mini_batch_size; setup_distconv(); #endif diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 6955e4e0a84..aee65bb7760 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -183,6 +183,17 @@ void init_data_readers( reader_numpy_npz->set_has_responses(!readme.disable_responses()); reader_numpy_npz->set_scaling_factor_int16(readme.scaling_factor_int16()); reader = reader_numpy_npz; +#ifdef LBANN_HAS_DISTCONV + } else if (name=="cosmoflow_hdf5") { + auto* reader_cosmo_hdf5 = new hdf5_reader(shuffle); + auto filedir = readme.data_filedir(); + if(!endsWith(filedir, "/")) { + filedir = filedir + "/"; + } + const auto paths = glob(filedir +readme.data_file_pattern()); + reader_cosmo_hdf5->set_hdf5_paths(paths); + reader = reader_cosmo_hdf5; +#endif // LBANN_HAS_DISTCONV } else if (name == "pilot2_molecular_reader") { pilot2_molecular_reader* reader_pilot2_molecular = new pilot2_molecular_reader(readme.num_neighbors(), readme.max_neighborhood(), shuffle); reader = reader_pilot2_molecular; From b5eef7c501c79020a8a53ed9bdea396b5c0087ef Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Mon, 7 Sep 2020 19:55:29 -0700 Subject: [PATCH 24/36] 
Updated distconv adaptors to use new optimizer and weight access methods. (#1621) --- src/layers/learning/base_convolution.cpp | 7 ++++--- src/layers/regularizers/batch_normalization.cu | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/layers/learning/base_convolution.cpp b/src/layers/learning/base_convolution.cpp index 3e98cf5017e..c9806f2fcc8 100644 --- a/src/layers/learning/base_convolution.cpp +++ b/src/layers/learning/base_convolution.cpp @@ -1280,13 +1280,14 @@ void base_convolution_adapter::setup_bp_tensors() { m_kernel_gradient = make_unique(kernel_shape, loc, shared_dist); // Gradient buffer is needed for auto-tuning the bp filter algorithm + auto* kernel_optimizer = static_cast*>(l.get_weights(0).get_optimizer()); assert0(dc::tensor::View( *m_kernel_gradient, - l.get_weights(0).get_optimizer()->get_gradient().Buffer())); + kernel_optimizer->get_gradient().Buffer())); // Bias tensor. Shared by all procs if (l.m_bias_scaling_factor != TensorDataType(0)) { - auto* bias_optimizer = l.get_weights(1).get_optimizer(); + auto* bias_optimizer = static_cast*>(l.get_weights(1).get_optimizer()); if (bias_optimizer != nullptr) { dc::Shape bias_shape(dc::get_num_dims(l), 1); bias_shape[dc::get_channel_dim()] = l.get_output_dims()[0]; @@ -1295,7 +1296,7 @@ void base_convolution_adapter::setup_bp_tensors() { // which is set when its view is set. assert0(dc::tensor::View( *m_bias_gradient, - l.get_weights(1).get_optimizer()->get_gradient().Buffer())); + bias_optimizer->get_gradient().Buffer())); } } } diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index 2683e8458f1..abbb8bf11b3 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -360,9 +360,9 @@ void batch_normalization_distconv_adapter::fp_com const bool is_training = l.m_model->get_execution_context().get_execution_mode() == execution_mode::training; auto& local_running_mean = - ValuesGetter::mutable_values(this->get_weights(2)).Matrix(); + ValuesGetter::mutable_values(l.get_weights(2)).Matrix(); auto& local_running_var = - ValuesGetter::mutable_values(this->get_weights(3)).Matrix(); + ValuesGetter::mutable_values(l.get_weights(3)).Matrix(); assert0(dc::tensor::View( m_scale, l.weights_values(0).LockedMatrix().LockedBuffer())); From 235ef9f300788dbfa8e8141a76e6aacf79152548 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 8 Sep 2020 10:49:24 -0700 Subject: [PATCH 25/36] fixed variant name for LBANN spack install command (#1623) Co-authored-by: Michael Wyatt --- scripts/install_lbann.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install_lbann.sh b/scripts/install_lbann.sh index 670a71332c5..8778bd17b89 100755 --- a/scripts/install_lbann.sh +++ b/scripts/install_lbann.sh @@ -61,7 +61,7 @@ SPACK_ARCH_TARGET=$(spack arch -t) SCRIPT=$(basename ${BASH_SOURCE}) BUILD_DIR=${LBANN_HOME}/build/spack ENABLE_GPUS=ON -GPU_VARIANTS="+gpu+nccl" +GPU_VARIANTS="+cuda+nccl" ENABLE_HALF=OFF HALF_VARIANTS="~half" BUILD_TYPE=Release From 621a54d147a318ad9573ae016636a509d2da5320 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 15 Sep 2020 10:07:54 -0700 Subject: [PATCH 26/36] GRU layer (#1626) * Implement forward prop for GPU GRU layer Added wrapper classes for cuDNN objects. Metric checking passes. * Progress on implementing backprop for GPU GRU layer Computes input grad, but not weights grad. Passes gradient checking. 
* Implement backprop for GPU GRU layer Gradient checking passes. * Update ATOM VAE model to use GRU layer * Optimize performance of ATOM VAE model Remove argmax layers. Move eval layers to CPU. * Add documentation for GRU layer Fixed compilation errors on CPU (still no CPU GRU layer). Disabled Bamboo test for GRU layer. * Implement GRU module for ATOM VAE model * Fix compilation errors with cuDNN 8 * Remove mixed-precision training from ATOM VAE model * Modify LTFB so NaNs lose tournaments * Improve numerical stability of reconstruction loss in ATOM VAE model * Minor tweaks in ATOM VAE model --- applications/ATOM/models/vae.py | 328 +++++-- applications/ATOM/train_atom_vae.py | 10 +- bamboo/unit_tests/test_unit_layer_gru.py | 236 +++++ include/lbann/layers/learning/CMakeLists.txt | 1 + include/lbann/layers/learning/gru.hpp | 130 +++ include/lbann/lbann.hpp | 1 + include/lbann/utils/cudnn.hpp | 223 +++++ python/lbann/modules/rnn.py | 9 +- src/callbacks/ltfb.cpp | 5 +- src/layers/learning/CMakeLists.txt | 1 + src/layers/learning/gru.cpp | 956 +++++++++++++++++++ src/proto/factories/layer_factory.cpp | 2 + src/proto/layers.proto | 23 + src/utils/cudnn.cpp | 440 +++++++++ 14 files changed, 2287 insertions(+), 78 deletions(-) create mode 100644 bamboo/unit_tests/test_unit_layer_gru.py create mode 100644 include/lbann/layers/learning/gru.hpp create mode 100644 src/layers/learning/gru.cpp diff --git a/applications/ATOM/models/vae.py b/applications/ATOM/models/vae.py index 64358b6d0b5..f7a6e180cfd 100644 --- a/applications/ATOM/models/vae.py +++ b/applications/ATOM/models/vae.py @@ -1,12 +1,92 @@ +import math import lbann import lbann.modules -from math import sqrt from lbann.util import make_iterable def str_list(l): """Convert an iterable object to a space-separated string.""" return ' '.join(str(i) for i in make_iterable(l)) +class GRUModule(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__( + self, + hidden_size, + num_layers=1, + weights=[], + name=None, + device=None, + datatype=None, + weights_datatype=None, + ): + GRUModule.global_count += 1 + self.instance = 0 + self.hidden_size = hidden_size + self.num_layers = num_layers + self.name = name if name else f'gru{GRUModule.global_count}' + self.device = device + self.datatype = datatype + + # Construct weights if needed + self.weights = weights + if not self.weights: + scale = 1 / math.sqrt(self.hidden_size) + init = lbann.UniformInitializer(min=-scale,max=scale) + if weights_datatype is None: + weights_datatype = self.datatype + self.weights = [] + for i in range(self.num_layers): + self.weights.extend( + lbann.Weights( + initializer=init, + name=f'{self.name}_layer{i}_{weight_name}', + datatype=weights_datatype, + ) + for weight_name in ('ih_matrix', 'hh_matrix', 'ih_bias', 'hh_bias') + ) + if self.weights and len(self.weights) != 4*self.num_layers: + raise ValueError( + f'expected {4*self.num_layers} weights, ' + f'but received {len(self.weights)}' + ) + + # Default initial hidden state + self.zeros = lbann.Constant( + value=0, + num_neurons=str(hidden_size), + name=f'{self.name}_zeros', + device=self.device, + datatype=self.datatype, + ) + + def forward(self, x, h=None): + self.instance += 1 + name = f'{self.name}_instance{self.instance}' + + # Initial hidden state + if not h: + h = [self.zeros] * self.num_layers + if not isinstance(h, list) or len(h) != self.num_layers: + raise ValueError( + f'expected `h` to be a list with {self.num_layers} layers' + ) + + # Stacked GRU + ### @todo
Replace with single GRU once LBANN supports stacked GRUs + for i in range(self.num_layers): + x = lbann.GRU( + x, + h[i], + hidden_size=self.hidden_size, + name=f'{name}_layer{i}', + weights=self.weights[4*i:4*(i+1)], + device=self.device, + datatype=self.datatype, + ) + return x + class MolVAE(lbann.modules.Module): """Molecular VAE. @@ -38,23 +118,45 @@ def __init__(self, input_feature_dims,dictionary_size, embedding_size, ignore_la self.embedding_size = embedding_size self.dictionary_size = dictionary_size self.label_to_ignore = ignore_label + self.datatype = lbann.DataType.FLOAT + self.weights_datatype = lbann.DataType.FLOAT fc = lbann.modules.FullyConnectedModule - gru = lbann.modules.GRU + gru = GRUModule + #Encoder - winit = lbann.GlorotNormalInitializer() - self.encoder_rnn = gru(size=256, name=self.name+'_encoder_rnn') + self.encoder_rnn = gru( + hidden_size=256, + name=self.name+'_encoder_rnn', + datatype=self.datatype, + weights_datatype=self.weights_datatype, + ) self.q_mu = fc(128,name=self.name+'_qmu') self.q_logvar = fc(128,name=self.name+'_qlogvar') + for w in self.q_mu.weights + self.q_logvar.weights: + w.datatype = self.weights_datatype + #Decoder - self.decoder_rnn0 = gru(size=512, name=self.name+'_decoder_rnn0') - self.decoder_rnn1 = gru(size=512, name=self.name+'_decoder_rnn1') - self.decoder_rnn2 = gru(size=512, name=self.name+'_decoder_rnn2') - self.decoder_lat = fc(512,name=self.name+'_decoder_lat') - self.decoder_fc = fc(dictionary_size,name=self.name+'_decoder_fc') - #shared encoder/decodeer weights - self.emb_weights = lbann.Weights(initializer=lbann.NormalInitializer(mean=0, standard_deviation=1), - name='emb_matrix') + self.decoder_rnn = gru( + hidden_size=512, + num_layers=3, + name=self.name+'_decoder_rnn', + datatype=self.datatype, + weights_datatype=self.weights_datatype, + ) + self.decoder_lat = fc(512, name=self.name+'_decoder_lat') + self.decoder_fc = fc(self.dictionary_size, name=self.name+'_decoder_fc') + for w in self.decoder_lat.weights + self.decoder_fc.weights: + w.datatype = self.weights_datatype + self.decoder_fc.weights[0].initializer = lbann.NormalInitializer( + mean=0, standard_deviation=1/math.sqrt(512)) + + #shared encoder/decoder weights + self.emb_weights = lbann.Weights( + initializer=lbann.NormalInitializer(mean=0, standard_deviation=1), + name='emb_matrix', + datatype=self.weights_datatype, + ) def forward(self, x): """Do the VAE forward step @@ -64,40 +166,63 @@ def forward(self, x): :return: float, recon component of loss """ - emb = lbann.Embedding(x, - num_embeddings=self.dictionary_size, - embedding_dim=self.embedding_size, - name='emb', - weights=self.emb_weights) - emb_slice = lbann.Slice(emb, - axis=0, - slice_points=str_list(range(self.input_feature_dims+1)), - name='emb_slice') - emb_list = [lbann.Reshape(emb_slice, dims='-1', name='emb'+str(i)) - for i in range(self.input_feature_dims)] + x = lbann.Slice(x, slice_points=str_list([0, self.input_feature_dims])) + x = lbann.Identity(x) + x_emb = lbann.Embedding( + x, + num_embeddings=self.dictionary_size, + embedding_dim=self.embedding_size, + name='emb', + weights=self.emb_weights + ) # Encoder: x -> z, kl_loss - z, kl_loss = self.forward_encoder(emb_list) + z, kl_loss = self.forward_encoder(x_emb) # Decoder: x, z -> recon_loss - recon_loss, arg_max = self.forward_decoder(x, emb_list, z) + pred = self.forward_decoder(x_emb, z) + recon_loss = self.compute_loss(x, pred) + + # Hack to remove blocking GPU allreduce in evaluation layer + kl_loss = lbann.Identity(kl_loss, 
device='CPU') + recon_loss = lbann.Identity(recon_loss, device='CPU') - return kl_loss, recon_loss, arg_max + return kl_loss, recon_loss - def forward_encoder(self, emb_list): + def forward_encoder(self, x_emb): """Encoder step, emulating z ~ E(x) = q_E(z|x) - :param embed_list: list of tensors of floats, input sentence emb_list + :param x_emb: (n_batch, len(x), d_z) of floats, embeddings for input sentence x :return: (n_batch, d_z) of floats, sample of latent vector z :return: float, kl term component of loss """ - h = lbann.Constant(value=0.0, num_neurons='256') - for i in range(self.input_feature_dims): - _, h = self.encoder_rnn(emb_list[i], h) + # _, h = self.encoder_rnn(x, None) + h = self.encoder_rnn(x_emb, None) + + h = lbann.Slice( + h, + slice_points=str_list([self.input_feature_dims-1, + self.input_feature_dims]), + axis=0, + ) + h = lbann.Identity(h) mu, logvar = self.q_mu(h), self.q_logvar(h) + # Set datatype of previous layers + # Note: Depth-first search from mu and logvar to x_emb + stack = [mu, logvar] + in_stack = {l : True for l in stack} + while stack: + l = stack.pop() + if type(l) not in (lbann.Slice, lbann.Reshape, lbann.Tessellate): + l.datatype = self.datatype + for parent in l.parents: + if parent not in in_stack and parent is not x_emb: + stack.append(parent) + in_stack[parent] = True + # eps = torch.randn_like(mu) eps = lbann.Gaussian(mean=0, stdev=1,hint_layer=mu) @@ -115,50 +240,115 @@ def forward_encoder(self, emb_list): return z, kl_loss - def forward_decoder(self, x, emb_list, z): + def forward_decoder(self, x_emb, z): """Decoder step, emulating x ~ G(z) - :param x: list of tensors of longs, input sentence x - :param emb_list: embeddings of x + :param x_emb: (n_batch, len(x), d_z) of floats, embeddings for input sentence x :param z: (n_batch, d_z) of floats, latent vector z :return: float, recon component of loss + :return: list of ints, reconstructed sentence """ + # z_0 = z.unsqueeze(1).repeat(1, x_emb.size(1), 1) + # x_input = torch.cat([x_emb, z_0], dim=-1) + z_0 = lbann.Tessellate( + lbann.Reshape(z, dims=str_list([1, 128])), + dims=str_list([self.input_feature_dims, 128]), + ) + x_input = lbann.Concatenation(x_emb, z_0, axis=1) + + h_0 = self.decoder_lat(z) + + # output, _ = self.decoder_rnn(x_input, h_0) + output = self.decoder_rnn(x_input, [h_0, h_0, h_0]) + + # y = self.decoder_fc(output) + y = lbann.ChannelwiseFullyConnected( + output, + output_channel_dims=self.dictionary_size, + bias=True, + name=f'{self.decoder_fc.name}', + weights=self.decoder_fc.weights, + ) + + # Set datatype of layers + # Note: Depth-first search from y to x_emb and z + stack = [y] + in_stack = {l : True for l in stack} + while stack: + l = stack.pop() + if type(l) not in (lbann.Slice, lbann.Reshape, lbann.Tessellate): + l.datatype = self.datatype + for parent in l.parents: + if parent not in in_stack and parent not in (x_emb, z): + stack.append(parent) + in_stack[parent] = True + + return y + + def compute_loss(self, x, y): + + # y[:, :-1] + y = lbann.Slice( + y, + axis=0, + slice_points=str_list([0, self.input_feature_dims-1]), + ) + y = lbann.Identity(y) + # x[:, 1:] - xshift = lbann.Slice(x, slice_points=str_list([1, self.input_feature_dims])) - xshift = lbann.Identity(xshift) - xshift_slice = lbann.Slice(xshift, slice_points=str_list(range(self.input_feature_dims))) - xshift_list = [lbann.Identity(xshift_slice) for i in range(self.input_feature_dims-1)] - - # Unroll RNN - h = [self.decoder_lat(z)] * 3 - recon_loss = [] - arg_max = [] - for i in 
range(self.input_feature_dims-1): - - # RNN stack - x_input = lbann.Concatenation(emb_list[i], z) - _, h[0] = self.decoder_rnn0(x_input, h[0]) - _, h[1] = self.decoder_rnn1(h[0], h[1]) - _, h[2] = self.decoder_rnn2(h[1], h[2]) - output = h[2] - #output = h[0] - y = self.decoder_fc(output) - arg_max.append(lbann.Argmax(y,device='CPU')) - - # Cross entropy loss - y = lbann.Softmax(y) - xshift_onehot = lbann.OneHot(xshift_list[i], size=self.dictionary_size) - recon_loss.append(lbann.CrossEntropy(y, xshift_onehot)) - - # Average cross entropy over sequence length - pad_mask = lbann.NotEqual(xshift, - lbann.Constant(value=self.label_to_ignore, hint_layer=xshift)) - length = lbann.Reduction(pad_mask, mode='sum') - length = lbann.Max(length, lbann.Constant(value=1, num_neurons="1")) - recon_loss = lbann.Concatenation(recon_loss) - recon_loss = lbann.Multiply(recon_loss, pad_mask) - recon_loss = lbann.Reduction(recon_loss, mode='sum') + x = lbann.Slice( + x, + slice_points=str_list([1, self.input_feature_dims]), + ) + x = lbann.Identity(x) + + # Convert indices in x to one-hot representation + # Note: Ignored indices result in zero vectors + ignore_mask = lbann.Equal( + x, + lbann.Constant(value=self.label_to_ignore, hint_layer=x), + ) + keep_mask = lbann.LogicalNot(ignore_mask) + length = lbann.Reduction(keep_mask, mode='sum') + length = lbann.Max( + length, + lbann.Constant(value=1, num_neurons=str_list([1])), + ) + x = lbann.Add( + lbann.Multiply(keep_mask, x), + lbann.Multiply(ignore_mask, lbann.Constant(value=-1, hint_layer=x)), + ) + x = lbann.Slice(x, slice_points=str_list(range(self.input_feature_dims))) + x = [lbann.Identity(x) for _ in range(self.input_feature_dims-1)] + x = [lbann.OneHot(xi, size=self.dictionary_size) for xi in x] + x = [lbann.Reshape(xi, dims=str_list([1, self.dictionary_size])) for xi in x] + x = lbann.Concatenation(x, axis=0) + + # recon_loss = F.cross_entropy( + # y[:, :-1].contiguous().view(-1, y.size(-1)), + # x[:, 1:].contiguous().view(-1), + # ignore_index=self.pad + # ) + z = lbann.MatMul( + lbann.Exp(y), + lbann.Constant( + value=1, + num_neurons=str_list([self.dictionary_size, 1]), + ), + ) + z = lbann.Log(z) + z = lbann.MatMul( + lbann.Reshape(keep_mask, dims=str_list([1, -1])), + z, + ) + recon_loss = lbann.MatMul( + lbann.Reshape(y, dims=str_list([-1, 1])), + lbann.Reshape(x, dims=str_list([-1, 1])), + transpose_a=True, + ) + recon_loss = lbann.Subtract(z, recon_loss) + recon_loss = lbann.Reshape(recon_loss, dims=str_list([1])) recon_loss = lbann.Divide(recon_loss, length) - return recon_loss, arg_max + return recon_loss diff --git a/applications/ATOM/train_atom_vae.py b/applications/ATOM/train_atom_vae.py index afbfdbf760b..1adf144e9bf 100644 --- a/applications/ATOM/train_atom_vae.py +++ b/applications/ATOM/train_atom_vae.py @@ -106,16 +106,14 @@ def construct_model(run_args): assert embedding_size is not None assert dictionary_size is not None - kl, recon, arg_max = molvae.MolVAE(input_feature_dims, - dictionary_size, - embedding_size, - pad_index)(input_) + kl, recon = molvae.MolVAE(input_feature_dims, + dictionary_size, + embedding_size, + pad_index)(input_) vae_loss.append(kl) vae_loss.append(recon) print("LEN vae loss ", len(vae_loss)) - #metric layers - pred_tensor = lbann.Concatenation(arg_max[:-1], name='pred_tensor') layers = list(lbann.traverse_layer_graph(input_)) # Setup objective function diff --git a/bamboo/unit_tests/test_unit_layer_gru.py b/bamboo/unit_tests/test_unit_layer_gru.py new file mode 100644 index 00000000000..81c35f5a664 --- 
/dev/null +++ b/bamboo/unit_tests/test_unit_layer_gru.py @@ -0,0 +1,236 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np +import scipy.special + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20200909) +_num_samples = 15 +_sequence_length = 5 +_input_size = 13 +_hidden_size = 7 +_sample_size = _sequence_length*_input_size + _hidden_size +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy implementation +# ============================================== + +def numpy_gru(x, h, ih_matrix, hh_matrix, ih_bias, hh_bias): + + # Cast inputs to float64 + if x.dtype != np.float64: + x = x.astype(np.float64) + if h.dtype != np.float64: + h = h.astype(np.float64) + if ih_matrix.dtype != np.float64: + ih_matrix = ih_matrix.astype(np.float64) + if hh_matrix.dtype != np.float64: + hh_matrix = hh_matrix.astype(np.float64) + if ih_bias.dtype != np.float64: + ih_bias = ih_bias.astype(np.float64) + if hh_bias.dtype != np.float64: + hh_bias = hh_bias.astype(np.float64) + + # Dimensions + sequence_length, input_size = x.shape + hidden_size = h.shape[0] + + # Unroll GRU + y = [] + for t in range(sequence_length): + ih = np.matmul(ih_matrix, x[t]) + ih_bias + hh = np.matmul(hh_matrix, h) + hh_bias + r = scipy.special.expit(ih[:hidden_size] + hh[:hidden_size]) + z = scipy.special.expit(ih[hidden_size:2*hidden_size] + hh[hidden_size:2*hidden_size]) + n = np.tanh(ih[2*hidden_size:] + r*hh[2*hidden_size:]) + h = (1-z)*n + z*h + y.append(h) + return np.stack(y) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct.
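+    # (The weights are zero-valued, so each sum acts as an identity map
+    # in the forward pass, but it exposes the layer's input error
+    # signals as weight gradients that CallbackCheckGradients can verify.)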
+    x_weights = lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), + name='input') + h_weights = lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), + name='initial_hidden') + input_ = lbann.Identity(lbann.Input()) + input_slice = lbann.Slice( + input_, + slice_points=tools.str_list([0, _sequence_length*_input_size, _sample_size]), + ) + x = lbann.Reshape(input_slice, dims=tools.str_list([_sequence_length,_input_size])) + x = lbann.Sum(x, lbann.WeightsLayer(weights=x_weights, hint_layer=x)) + h = lbann.Reshape(input_slice, dims=tools.str_list([_hidden_size]),) + h = lbann.Sum(h, lbann.WeightsLayer(weights=h_weights, hint_layer=h)) + x_lbann = x + h_lbann = h + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # 1-layer, uni-directional GRU + # ------------------------------------------ + + # Weights + ih_matrix = np.random.normal(size=(3*_hidden_size,_input_size)).astype(np.float32) + hh_matrix = np.random.normal(size=(3*_hidden_size,_hidden_size)).astype(np.float32) + ih_bias = np.random.normal(size=(3*_hidden_size,)).astype(np.float32) + hh_bias = np.random.normal(size=(3*_hidden_size,)).astype(np.float32) + ih_matrix_weights = lbann.Weights( + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(ih_matrix, order='F')))) + hh_matrix_weights = lbann.Weights( + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(hh_matrix, order='F')))) + ih_bias_weights = lbann.Weights( + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(ih_bias)))) + hh_bias_weights = lbann.Weights( + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(hh_bias)))) + + # LBANN implementation + x = x_lbann + h = h_lbann + y = lbann.GRU( + x, + h, + hidden_size=_hidden_size, + weights=[ih_matrix_weights,hh_matrix_weights,ih_bias_weights,hh_bias_weights], + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='1-layer, unidirectional')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + input_ = get_sample(i).astype(np.float64) + x = input_[:_sequence_length*_input_size].reshape((_sequence_length,_input_size)) + h = input_[_sequence_length*_input_size:] + y = numpy_gru(x, h, ih_matrix, hh_matrix, ih_bias, hh_bias) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved.
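+    # (Both readers reuse the same sample-access functions; the training
+    # reader is only present because experiment setup currently requires one.)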
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# for test in tools.create_tests(setup_experiment, __file__): +# globals()[test.__name__] = test diff --git a/include/lbann/layers/learning/CMakeLists.txt b/include/lbann/layers/learning/CMakeLists.txt index 71111d57435..a5f96c38e62 100644 --- a/include/lbann/layers/learning/CMakeLists.txt +++ b/include/lbann/layers/learning/CMakeLists.txt @@ -9,6 +9,7 @@ set_full_path(THIS_DIR_HEADERS entrywise_scale_bias.hpp fully_connected.hpp fully_connected_cuda.hpp + gru.hpp learning.hpp ) diff --git a/include/lbann/layers/learning/gru.hpp b/include/lbann/layers/learning/gru.hpp new file mode 100644 index 00000000000..562a782f626 --- /dev/null +++ b/include/lbann/layers/learning/gru.hpp @@ -0,0 +1,130 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED +#define LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#ifdef LBANN_HAS_CUDNN +#include "lbann/utils/cudnn.hpp" +#endif // LBANN_HAS_CUDNN + +namespace lbann { + +/** @brief Gated recurrent unit + * + * Expects two inputs: a 2D input sequence ( + * @f$ \text{sequence\_length}\times\text{input\_size} @f$ ) + * and a 1D initial hidden state ( @f$ \text{hidden\_size} @f$ ). + * + * Uses four weights: "ih\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{input\_size} @f$ ), + * "hh\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{hidden\_size} @f$ ), + * "ih_bias" ( @f$ 3 \text{hidden\_size} @f$ ), + * "hh_bias" ( @f$ 3 \text{hidden\_size} @f$ ). + * + * @todo Support CPU + * @todo Support bidirectional RNNs + * @todo Support stacked RNNs + * + * @warning cuDNN 8 exposes a new RNN API and deprecates the old one. + * Consider reimplementing this layer once cuDNN 8 is the minimum + * version. 
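+ *
+ * A minimal usage sketch from the Python front end (dimensions echo
+ * the unit test added in this patch):
+ * @code{.py}
+ * import lbann
+ * x = lbann.Reshape(lbann.Identity(lbann.Input()), dims='5 13') # sequence_length x input_size
+ * h = lbann.Constant(value=0, num_neurons='7')                  # initial hidden state
+ * y = lbann.GRU(x, h, hidden_size=7)
+ * @endcode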
+ */ +template +class gru_layer + : public data_type_layer { + + static_assert(Layout == data_layout::DATA_PARALLEL, + "GRU layer only supports data parallel layout"); + +public: + + gru_layer( + lbann_comm* comm, + size_t hidden_size); + + gru_layer(const gru_layer& other); + gru_layer& operator=(const gru_layer& other); + ~gru_layer() = default; + + gru_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; +#ifdef LBANN_HAS_CUDNN + void setup_gpu() override; +#endif // LBANN_HAS_CUDNN + + void fp_compute() override; + void bp_compute() override; + +private: + + size_t m_hidden_size; + +#ifdef LBANN_HAS_CUDNN + using ByteBuffer = hydrogen::simple_buffer; + cudnn::RNNDescriptor m_rnn_cudnn_desc; + cudnn::TensorDescriptor m_input_cudnn_desc; + cudnn::TensorDescriptor m_output_cudnn_desc; + cudnn::TensorDescriptor m_hidden_cudnn_desc; + cudnn::FilterDescriptor m_packed_weights_cudnn_desc; + ByteBuffer m_cudnn_reserve_space; +#endif // LBANN_HAS_CUDNN + + template + friend void fp_compute_impl(gru_layer&); + template + friend void bp_compute_impl(gru_layer&); + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(gru); + +// Explicit template instantiation +#ifdef LBANN_HAS_CUDNN +#ifndef LBANN_GRU_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class gru_layer< \ + T, data_layout::DATA_PARALLEL, El::Device::GPU>; +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#endif // LBANN_GRU_LAYER_INSTANTIATE +#endif // LBANN_HAS_CUDNN + +} // namespace lbann + +#endif // LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 1496a7211d3..109782a3ab1 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -54,6 +54,7 @@ #include "lbann/layers/learning/embedding.hpp" #include "lbann/layers/learning/channelwise_scale_bias.hpp" #include "lbann/layers/learning/entrywise_scale_bias.hpp" +#include "lbann/layers/learning/gru.hpp" /// Loss layers #include "lbann/layers/loss/categorical_accuracy.hpp" diff --git a/include/lbann/utils/cudnn.hpp b/include/lbann/utils/cudnn.hpp index 355d644805e..2d0b76a657f 100644 --- a/include/lbann/utils/cudnn.hpp +++ b/include/lbann/utils/cudnn.hpp @@ -143,6 +143,229 @@ void copy_tensor_desc(const cudnnTensorDescriptor_t& src, void copy_activation_desc(const cudnnActivationDescriptor_t& src, cudnnActivationDescriptor_t& dst); +//////////////////////////////////////////////////////////// +// Wrapper classes for cuDNN types +//////////////////////////////////////////////////////////// + +/** @brief Wrapper around @c cudnnTensorDescriptor_t */ +class TensorDescriptor { + +public: + + TensorDescriptor(cudnnTensorDescriptor_t desc=nullptr); + template + TensorDescriptor(ArgTs... 
args) { + set(args...); + } + + ~TensorDescriptor(); + + // Copy-and-swap idiom + TensorDescriptor(const TensorDescriptor&); + TensorDescriptor(TensorDescriptor&&); + TensorDescriptor& operator=(TensorDescriptor); + friend void swap(TensorDescriptor& first, TensorDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnTensorDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnTensorDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnTensorDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnTensorDescriptor_t() const noexcept; + + /** @brief Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** @brief Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + cudnnDataType_t data_type, + const std::vector& dims, + std::vector strides = {}); + /** @brief Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + template + void set( + cudnnDataType_t data_type, + IntTs... dims) { + set(data_type, {static_cast(dims)...}); + } + +private: + + cudnnTensorDescriptor_t desc_{nullptr}; + +}; + +/** Wrapper around @c cudnnFilterDescriptor_t */ +class FilterDescriptor { + +public: + + FilterDescriptor(cudnnFilterDescriptor_t desc=nullptr); + template + FilterDescriptor(ArgTs... args) { + set(args...); + } + + ~FilterDescriptor(); + + // Copy-and-swap idiom + FilterDescriptor(const FilterDescriptor&); + FilterDescriptor(FilterDescriptor&&); + FilterDescriptor& operator=(FilterDescriptor); + friend void swap(FilterDescriptor& first, FilterDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnFilterDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnFilterDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnFilterDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnFilterDescriptor_t() const noexcept; + + /** Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + cudnnDataType_t data_type, + cudnnTensorFormat_t format, + const std::vector& dims); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + template + void set( + cudnnDataType_t data_type, + cudnnTensorFormat_t format, + IntTs... dims) { + set(data_type, format, {static_cast(dims)...}); + } + +private: + + cudnnFilterDescriptor_t desc_{nullptr}; + +}; + +/** Wrapper around @c cudnnDropoutDescriptor_t */ +class DropoutDescriptor { + +public: + + DropoutDescriptor(cudnnDropoutDescriptor_t desc=nullptr); + template + DropoutDescriptor(ArgTs... 
args) { + set(args...); + } + + ~DropoutDescriptor(); + + // Copy-and-swap idiom + DropoutDescriptor(const DropoutDescriptor&); + DropoutDescriptor(DropoutDescriptor&&); + DropoutDescriptor& operator=(DropoutDescriptor); + friend void swap(DropoutDescriptor& first, DropoutDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnDropoutDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnDropoutDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnDropoutDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnDropoutDescriptor_t() const noexcept; + + /** Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + float dropout, + void* states, + size_t states_size, + unsigned long long seed); + +private: + + cudnnDropoutDescriptor_t desc_{nullptr}; + +}; + +/** Wrapper around @c cudnnRNNDescriptor_t */ +class RNNDescriptor { + +public: + + RNNDescriptor(cudnnRNNDescriptor_t desc=nullptr); + template + RNNDescriptor(ArgTs... args) { + set(args...); + } + + ~RNNDescriptor(); + + // Copy-and-swap idiom + RNNDescriptor(const RNNDescriptor&); + RNNDescriptor(RNNDescriptor&&); + RNNDescriptor& operator=(RNNDescriptor); + friend void swap(RNNDescriptor& first, RNNDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnRNNDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnRNNDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnRNNDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnRNNDescriptor_t() const noexcept; + + /** Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + size_t hidden_size, + size_t num_layers, + cudnnDropoutDescriptor_t dropout_desc, + cudnnRNNInputMode_t input_mode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnRNNAlgo_t algo, + cudnnDataType_t math_precision); + +private: + + cudnnRNNDescriptor_t desc_{nullptr}; + +}; + //////////////////////////////////////////////////////////// // cuDNN tensor managers //////////////////////////////////////////////////////////// diff --git a/python/lbann/modules/rnn.py b/python/lbann/modules/rnn.py index 4d3ee1a5982..630ca23e31a 100644 --- a/python/lbann/modules/rnn.py +++ b/python/lbann/modules/rnn.py @@ -213,6 +213,13 @@ def __init__(self, size, bias = True, data_layout=self.data_layout ) + self.ones = lbann.Constant( + value=1.0, + num_neurons=str(size), + data_layout=self.data_layout, + name=self.name+'_ones', + ) + def forward(self, x, prev_state): """Apply GRU step. @@ -285,7 +292,7 @@ def forward(self, x, prev_state): lbann.Add( lbann.Multiply( lbann.WeightedSum( - lbann.Constant(value=1.0, hint_layer=zt, data_layout=self.data_layout), + self.ones, zt, scaling_factors='1 -1', data_layout=self.data_layout ), diff --git a/src/callbacks/ltfb.cpp b/src/callbacks/ltfb.cpp index c682e628a4c..6f277258ed4 100644 --- a/src/callbacks/ltfb.cpp +++ b/src/callbacks/ltfb.cpp @@ -515,8 +515,9 @@ void ltfb::on_batch_begin(model *m) { // Choose tournament winner // Note: restore local model data if it got a better score. 
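+      // Note: comparisons with NaN are always false, so a NaN local
+      // score falls through and the partner wins; the explicit isnan
+      // check below also makes a NaN partner score lose to a valid
+      // local score.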
El::Int tournament_winner = partner_trainer; - if ((m_low_score_wins && local_score <= partner_score) || - (!m_low_score_wins && local_score >= partner_score)) { + if ((m_low_score_wins && local_score <= partner_score) + || (!m_low_score_wins && local_score >= partner_score) + || (!std::isnan(local_score) && std::isnan(partner_score))) { tournament_winner = local_trainer; switch (m_comm_algo) { case communication_algorithm::sendrecv_weights: diff --git a/src/layers/learning/CMakeLists.txt b/src/layers/learning/CMakeLists.txt index 3b9207d7f8a..55e19abe0b6 100644 --- a/src/layers/learning/CMakeLists.txt +++ b/src/layers/learning/CMakeLists.txt @@ -10,6 +10,7 @@ set_full_path(THIS_DIR_SOURCES embedding.cpp embedding_builder.cpp fully_connected.cpp + gru.cpp ) if (LBANN_HAS_CUDA) diff --git a/src/layers/learning/gru.cpp b/src/layers/learning/gru.cpp new file mode 100644 index 00000000000..a6df3666f5c --- /dev/null +++ b/src/layers/learning/gru.cpp @@ -0,0 +1,956 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_GRU_LAYER_INSTANTIATE +#include "lbann/layers/learning/gru.hpp" +#include "lbann/models/model.hpp" +#include "lbann/weights/initializer.hpp" +#include "lbann/proto/proto_common.hpp" +#include + +namespace lbann { + +// --------------------------------------------- +// Life cycle +// --------------------------------------------- + +template +gru_layer::gru_layer(lbann_comm* comm, size_t hidden_size) + : data_type_layer(comm), + m_hidden_size{hidden_size} { + this->m_expected_num_parent_layers = 2; +} + +template +gru_layer::gru_layer(const gru_layer& other) + : data_type_layer(other), + m_hidden_size{other.m_hidden_size} +#ifdef LBANN_HAS_CUDNN + , m_rnn_cudnn_desc{other.m_rnn_cudnn_desc}, + m_input_cudnn_desc{other.m_input_cudnn_desc}, + m_output_cudnn_desc{other.m_output_cudnn_desc}, + m_hidden_cudnn_desc{other.m_hidden_cudnn_desc}, + m_packed_weights_cudnn_desc{other.m_packed_weights_cudnn_desc} +#endif // LBANN_HAS_CUDNN +{ +#ifdef LBANN_HAS_CUDNN + /// @todo Copy m_cudnn_reserve_space? 
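+  /// (Assumption: leaving the copy empty is safe because the reserve
+  /// space only caches cuDNN forward-prop state, which is regenerated
+  /// on the next forward pass.)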
+#endif // LBANN_HAS_CUDNN +} + +template +gru_layer& gru_layer +::operator=(const gru_layer& other) { + data_type_layer::operator=(other); + m_hidden_size = other.m_hidden_size; +#ifdef LBANN_HAS_CUDNN + m_rnn_cudnn_desc = other.m_rnn_cudnn_desc; + m_input_cudnn_desc = other.m_input_cudnn_desc; + m_output_cudnn_desc = other.m_output_cudnn_desc; + m_hidden_cudnn_desc = other.m_hidden_cudnn_desc; + m_packed_weights_cudnn_desc = other.m_packed_weights_cudnn_desc; + /// @todo Copy m_cudnn_reserve_space? +#endif // LBANN_HAS_CUDNN + return *this; +} + +template +gru_layer* +gru_layer +::copy() const +{ + return new gru_layer(*this); +} + +// --------------------------------------------- +// Query functions +// --------------------------------------------- + +template +std::string +gru_layer +::get_type() const +{ + return "GRU"; +} + +template +data_layout +gru_layer +::get_data_layout() const +{ + return Layout; +} + +template +El::Device +gru_layer +::get_device_allocation() const +{ + return Device; +} + +template +description +gru_layer +::get_description() const +{ + auto desc = data_type_layer::get_description(); + desc.add("Hidden size", m_hidden_size); + return desc; +} + +// --------------------------------------------- +// Setup +// --------------------------------------------- + +template +void gru_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + const int sequence_length = this->get_input_dims(0)[0]; + if (static_cast(this->get_input_size(1)) != m_hidden_size) { + LBANN_ERROR( + this->get_type()," layer \"",this->get_name(),"\" ", + "has an invalid input tensor for the initial hidden state"); + } + const std::vector output_dims = {sequence_length, static_cast(m_hidden_size)}; + this->set_output_dims(output_dims); +} + +template +void gru_layer +::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + const size_t sequence_length = this->get_input_dims()[0]; + const size_t input_size = this->get_input_size(0) / sequence_length; + + // Construct default weights if needed + if (!this->has_weights()) { + const std::vector weight_names + = {"ih_matrix", "hh_matrix", "ih_bias", "hh_bias"}; + this->set_num_weights(4); + const auto scale = El::To(1./std::sqrt(m_hidden_size)); + for (size_t i=0; i<4; ++i) { + auto w = make_unique>(this->get_comm()); + auto init = make_unique>(-scale, scale); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_" + weight_names[i]); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->set_weights(i, w.get()); + this->m_model->add_weights(std::move(w)); + } + } + if (this->num_weights() != 4) { + LBANN_ERROR( + "attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 4, found ",this->num_weights(),")"); + } + + // Setup weight dimensions and distribution + auto& ih_matrix = this->get_weights(0); + auto& hh_matrix = this->get_weights(1); + auto& ih_bias = this->get_weights(2); + auto& hh_bias = this->get_weights(3); + ih_matrix.set_dims({static_cast(3*m_hidden_size)}, {static_cast(input_size)}); + hh_matrix.set_dims({static_cast(3*m_hidden_size)}, {static_cast(m_hidden_size)}); + ih_bias.set_dims({static_cast(3*m_hidden_size)}); + hh_bias.set_dims({static_cast(3*m_hidden_size)}); + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + dist.rowDist = El::STAR; + ih_matrix.set_matrix_distribution(dist); 
+ hh_matrix.set_matrix_distribution(dist); + ih_bias.set_matrix_distribution(dist); + hh_bias.set_matrix_distribution(dist); + +} + +#ifdef LBANN_HAS_CUDNN +template +void gru_layer::setup_gpu() { + + // Dimensions + const size_t sequence_length = this->get_input_dims(0)[0]; + const size_t input_size = this->get_input_size(0) / sequence_length; + + // GPU objects + auto&& handle = cudnn::get_handle(); + auto data_type = cudnn::get_data_type(); + + // RNN descriptor + size_t dropout_state_size; + CHECK_CUDNN(cudnnDropoutGetStatesSize(handle, &dropout_state_size)); + cudnn::DropoutDescriptor dropout_desc(0.f, nullptr, dropout_state_size, 0); + m_rnn_cudnn_desc.set( + m_hidden_size, + 1, // num_layers + dropout_desc, + CUDNN_LINEAR_INPUT, + CUDNN_UNIDIRECTIONAL, + CUDNN_GRU, + CUDNN_RNN_ALGO_STANDARD, + data_type); + + // Input and output tensor descriptors + m_input_cudnn_desc.set(data_type, 1, input_size, 1); + m_output_cudnn_desc.set(data_type, 1, m_hidden_size, 1); + m_hidden_cudnn_desc.set(data_type, 1, 1, m_hidden_size); + + // Packed weights descriptor + size_t weights_size; + CHECK_CUDNN( + cudnnGetRNNParamsSize( + handle, + m_rnn_cudnn_desc, + m_input_cudnn_desc, + &weights_size, + data_type)); + m_packed_weights_cudnn_desc.set( + data_type, + CUDNN_TENSOR_NCHW, + weights_size / sizeof(TensorDataType), + 1, + 1); + +} +#endif // LBANN_HAS_CUDNN + +// --------------------------------------------- +// Forward prop +// --------------------------------------------- + +template +void gru_layer::fp_compute() { + fp_compute_impl(*this); +} + +namespace { +#ifdef LBANN_HAS_CUDNN +template +hydrogen::simple_buffer pack_cudnn_rnn_weights( + const cudnnHandle_t& handle, + const cudnn::RNNDescriptor& rnn_desc, + const cudnn::TensorDescriptor& input_desc, + const cudnn::FilterDescriptor& weights_desc, + const El::SyncInfo& sync_info, + size_t input_size, + size_t hidden_size, + const El::Matrix& ih_matrix, + const El::Matrix& hh_matrix, + const El::Matrix& ih_bias, + const El::Matrix& hh_bias) { + + // Allocate buffer for packed weights + size_t packed_weights_size; + CHECK_CUDNN( + cudnnGetRNNParamsSize( + handle, + rnn_desc, + input_desc, + &packed_weights_size, + cudnn::get_data_type())); + hydrogen::simple_buffer packed_weights(packed_weights_size, sync_info); + + // Construct objects + static cudnn::FilterDescriptor result_weights_desc; + result_weights_desc.create(); + El::Matrix packed_weights_view; + packed_weights_view.SetSyncInfo(sync_info); + + // Functions to get pointers in packed weights buffer + auto get_matrix_ptr = [&] (size_t id) -> TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerMatrixParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + packed_weights.data(), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + auto get_bias_ptr = [&] (size_t id) -> TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerBiasParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + packed_weights.data(), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + + // Copy from ih_matrix + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + input_size, + hidden_size, + get_matrix_ptr(i), + input_size); + El::Transpose( + ih_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view, + false); + } + + // Copy from hh_matrix + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + hidden_size, 
+ hidden_size, + get_matrix_ptr(3+i), + hidden_size); + El::Transpose( + hh_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view, + false); + } + + // Copy from ih_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + hidden_size, + 1, + get_bias_ptr(i), + hidden_size); + El::Copy( + ih_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view); + } + + // Copy from hh_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + hidden_size, + 1, + get_bias_ptr(3+i), + hidden_size); + El::Copy( + hh_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view); + } + + return packed_weights; +} +#endif // LBANN_HAS_CUDNN +} // namespace + +#ifdef LBANN_HAS_CUDNN +template +void fp_compute_impl( + gru_layer& l) { + using LocalMat = El::Matrix; + using ByteBuffer = hydrogen::simple_buffer; + + // Matrices + const auto& local_input_sequence + = dynamic_cast(l.get_local_prev_activations(0)); + const auto& local_init_hidden + = dynamic_cast(l.get_local_prev_activations(1)); + auto& local_output_sequence + = dynamic_cast(l.get_local_activations()); + const auto& ih_matrix + = dynamic_cast(l.weights_values(0).LockedMatrix()); + const auto& hh_matrix + = dynamic_cast(l.weights_values(1).LockedMatrix()); + const auto& ih_bias + = dynamic_cast(l.weights_values(2).LockedMatrix()); + const auto& hh_bias + = dynamic_cast(l.weights_values(3).LockedMatrix()); + + // Dimensions + const size_t sequence_length = l.get_input_dims(0)[0]; + const size_t mini_batch_size = local_input_sequence.Width(); + const size_t input_size = l.get_input_size(0) / sequence_length; + const size_t hidden_size = l.m_hidden_size; + + // Return immediately if there is no local data + if (mini_batch_size <= 0) { + return; + } + + // GPU objects + auto&& sync_info = local_input_sequence.GetSyncInfo(); + auto&& handle = cudnn::get_handle(); + const auto data_type = cudnn::get_data_type(); + + // Configure input and output tensor descriptors + auto& input_desc = l.m_input_cudnn_desc; + auto& output_desc = l.m_output_cudnn_desc; + auto& hidden_desc = l.m_hidden_cudnn_desc; + input_desc.set(data_type, mini_batch_size, input_size, 1); + output_desc.set(data_type, mini_batch_size, hidden_size, 1); + hidden_desc.set(data_type, 1, mini_batch_size, hidden_size); + std::vector + input_desc_list(sequence_length, input_desc), + output_desc_list(sequence_length, output_desc); + + // Reorder input tensor dims + // Note: cuDNN uses sequence_length x mini_batch_size x hidden_size + /// @todo Consider custom kernel + LocalMat input_sequence_workspace, output_sequence_workspace; + input_sequence_workspace.SetSyncInfo(sync_info); + output_sequence_workspace.SetSyncInfo(sync_info); + input_sequence_workspace.Resize(mini_batch_size*input_size, sequence_length); + output_sequence_workspace.Resize(mini_batch_size*hidden_size, sequence_length); + for (size_t i=0; i +void gru_layer::bp_compute() { + bp_compute_impl(*this); +} + +namespace { +#ifdef LBANN_HAS_CUDNN +template +void unpack_cudnn_rnn_weights( + const cudnnHandle_t& handle, + const cudnn::RNNDescriptor& rnn_desc, + const cudnn::TensorDescriptor& input_desc, + const cudnn::FilterDescriptor& weights_desc, + const El::SyncInfo& sync_info, + size_t input_size, + size_t hidden_size, + const TensorDataType* packed_weights_buffer, + El::Matrix& ih_matrix, + El::Matrix& hh_matrix, + El::Matrix& ih_bias, + El::Matrix& hh_bias) { + + // Construct objects + static cudnn::FilterDescriptor result_weights_desc; + 
result_weights_desc.create(); + El::Matrix packed_weights_view; + packed_weights_view.SetSyncInfo(sync_info); + + // Functions to get pointers in packed weights buffer + auto get_matrix_ptr = [&] (size_t id) -> const TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerMatrixParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + const_cast(reinterpret_cast(packed_weights_buffer)), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + auto get_bias_ptr = [&] (size_t id) -> const TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerBiasParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + const_cast(reinterpret_cast(packed_weights_buffer)), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + + // Copy from ih_matrix + for (auto i : {0, 1, 2}) { + packed_weights_view.LockedAttach( + input_size, + hidden_size, + get_matrix_ptr(i), + input_size); + auto ih_matrix_view = ih_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Transpose(packed_weights_view, ih_matrix_view, false); + } + + // Copy from hh_matrix + for (auto i : {0, 1, 2}) { + packed_weights_view.LockedAttach( + hidden_size, + hidden_size, + get_matrix_ptr(3+i), + hidden_size); + auto hh_matrix_view = hh_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Transpose(packed_weights_view, hh_matrix_view, false); + } + + // Copy from ih_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.LockedAttach( + hidden_size, + 1, + get_bias_ptr(i), + hidden_size); + auto ih_bias_view = ih_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Copy(packed_weights_view, ih_bias_view); + } + + // Copy from hh_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.LockedAttach( + hidden_size, + 1, + get_bias_ptr(3+i), + hidden_size); + auto hh_bias_view = hh_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Copy(packed_weights_view, hh_bias_view); + } + +} +#endif // LBANN_HAS_CUDNN +} // namespace + +#ifdef LBANN_HAS_CUDNN +template +void bp_compute_impl( + gru_layer& l) { + using LocalMat = El::Matrix; + using ByteBuffer = hydrogen::simple_buffer; + + // Matrices + const auto& local_input_sequence + = dynamic_cast(l.get_local_prev_activations(0)); + const auto& local_init_hidden + = dynamic_cast(l.get_local_prev_activations(1)); + const auto& local_output_sequence + = dynamic_cast(l.get_local_activations()); + const auto& local_output_sequence_grad + = dynamic_cast(l.get_local_prev_error_signals()); + auto& local_input_sequence_grad + = dynamic_cast(l.get_local_error_signals(0)); + auto& local_init_hidden_grad + = dynamic_cast(l.get_local_error_signals(1)); + const auto& ih_matrix + = dynamic_cast(l.weights_values(0).LockedMatrix()); + const auto& hh_matrix + = dynamic_cast(l.weights_values(1).LockedMatrix()); + const auto& ih_bias + = dynamic_cast(l.weights_values(2).LockedMatrix()); + const auto& hh_bias + = dynamic_cast(l.weights_values(3).LockedMatrix()); + + // Dimensions + const size_t sequence_length = l.get_input_dims(0)[0]; + const size_t mini_batch_size = local_input_sequence.Width(); + const size_t input_size = l.get_input_size(0) / sequence_length; + const size_t hidden_size = l.m_hidden_size; + + // GPU objects + auto&& sync_info = local_input_sequence.GetSyncInfo(); + auto&& handle = cudnn::get_handle(); + + // Define closure to send weight gradients to optimizers + LocalMat ih_matrix_grad, 
hh_matrix_grad, ih_bias_grad, hh_bias_grad; + ih_matrix_grad.SetSyncInfo(sync_info); + hh_matrix_grad.SetSyncInfo(sync_info); + ih_bias_grad.SetSyncInfo(sync_info); + hh_bias_grad.SetSyncInfo(sync_info); + ih_matrix_grad.Resize(3*hidden_size, input_size); + hh_matrix_grad.Resize(3*hidden_size, hidden_size); + ih_bias_grad.Resize(3*hidden_size, 1); + hh_bias_grad.Resize(3*hidden_size, 1); + auto send_weight_grads_to_optimizers = [&] () { + TensorDataType buf_scale, in_scale; + auto&& ih_matrix_opt = l.get_weights(0).get_optimizer(); + auto&& hh_matrix_opt = l.get_weights(1).get_optimizer(); + auto&& ih_bias_opt = l.get_weights(2).get_optimizer(); + auto&& hh_bias_opt = l.get_weights(3).get_optimizer(); + if (ih_matrix_opt != nullptr) { + auto& buf = ih_matrix_opt->get_gradient_buffer(buf_scale, in_scale, true); + El::Scale(buf_scale, buf); + El::Axpy(in_scale, ih_matrix_grad, buf.Matrix()); + } + if (hh_matrix_opt != nullptr) { + auto& buf = hh_matrix_opt->get_gradient_buffer(buf_scale, in_scale, true); + El::Scale(buf_scale, buf); + El::Axpy(in_scale, hh_matrix_grad, buf.Matrix()); + } + if (ih_bias_opt != nullptr) { + auto& buf = ih_bias_opt->get_gradient_buffer(buf_scale, in_scale, true); + El::Scale(buf_scale, buf); + El::Axpy(in_scale, ih_bias_grad, buf.Matrix()); + } + if (hh_bias_opt != nullptr) { + auto& buf = hh_bias_opt->get_gradient_buffer(buf_scale, in_scale, true); + El::Scale(buf_scale, buf); + El::Axpy(in_scale, hh_bias_grad, buf.Matrix()); + } + }; + + // Return immediately if there is no local data + if (mini_batch_size <= 0) { + El::Zero(ih_matrix_grad); + El::Zero(hh_matrix_grad); + El::Zero(ih_bias_grad); + El::Zero(hh_bias_grad); + send_weight_grads_to_optimizers(); + } + + // Configure input and output tensor descriptors + // Note: Descriptor dims have already been set in forward prop + auto& input_desc = l.m_input_cudnn_desc; + auto& output_desc = l.m_output_cudnn_desc; + auto& hidden_desc = l.m_hidden_cudnn_desc; + std::vector + input_desc_list(sequence_length, input_desc), + output_desc_list(sequence_length, output_desc); + + // Reorder tensor dims + // Note: cuDNN uses sequence_length x mini_batch_size x size + /// @todo Consider custom kernel + LocalMat input_sequence_workspace, output_sequence_workspace; + LocalMat input_sequence_grad_workspace, output_sequence_grad_workspace; + input_sequence_workspace.SetSyncInfo(sync_info); + output_sequence_workspace.SetSyncInfo(sync_info); + input_sequence_grad_workspace.SetSyncInfo(sync_info); + output_sequence_grad_workspace.SetSyncInfo(sync_info); + input_sequence_workspace.Resize(mini_batch_size*input_size, sequence_length); + output_sequence_workspace.Resize(mini_batch_size*hidden_size, sequence_length); + input_sequence_grad_workspace.Resize(mini_batch_size*input_size, sequence_length); + output_sequence_grad_workspace.Resize(mini_batch_size*hidden_size, sequence_length); + for (size_t i=0; i +struct Builder +{ + template + static std::unique_ptr Build(Args&&...) + { + LBANN_ERROR( + "Attempted to construct gru_layer with invalid parameters ", + "(TensorDataType=",TypeName(),", ", + "Layout=",to_string(Layout),", ", + "Device=",to_string(Device),")"); + return nullptr; + } +}; + +#ifdef LBANN_HAS_CUDNN +template +struct Builder +{ + template + static std::unique_ptr Build(Args&&... 
args) + { + constexpr auto Layout = data_layout::DATA_PARALLEL; + constexpr auto Device = El::Device::GPU; + using LayerType = gru_layer; + return make_unique(std::forward(args)...); + } +}; +#endif // LBANN_HAS_CUDNN + +} // namespace + +template +std::unique_ptr build_gru_layer_from_pbuf( + lbann_comm* comm, lbann_data::Layer const& proto_layer) +{ + using BuilderType = Builder; + LBANN_ASSERT_MSG_HAS_FIELD(proto_layer, gru); + const auto& params = proto_layer.gru(); + return BuilderType::Build(comm, params.hidden_size()); +} + +// --------------------------------------------- +// Explicit template instantiation +// --------------------------------------------- + +/// @todo CPU implementation +#ifdef LBANN_HAS_CUDNN +#define PROTO(T) \ + template class gru_layer< \ + T, data_layout::DATA_PARALLEL, El::Device::GPU>; +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#endif // LBANN_HAS_CUDNN + +#define PROTO_DEVICE(T, Device) \ + LBANN_LAYER_BUILDER_ETI(gru, T, Device) +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +} // namespace lbann diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 6d9d52899b4..6a1ef565820 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -48,6 +48,7 @@ #include "lbann/layers/learning/embedding.hpp" #include "lbann/layers/learning/entrywise_scale_bias.hpp" #include "lbann/layers/learning/fully_connected.hpp" +#include "lbann/layers/learning/gru.hpp" #include "lbann/layers/learning/learning.hpp" #include "lbann/layers/loss/categorical_accuracy.hpp" #include "lbann/layers/loss/cross_entropy.hpp" @@ -167,6 +168,7 @@ class factory_manager LBANN_REGISTER_BUILDER(Embedding, embedding); LBANN_REGISTER_BUILDER(EntrywiseScaleBias, entrywise_scale_bias); LBANN_REGISTER_BUILDER(FullyConnected, fully_connected); + LBANN_REGISTER_BUILDER(GRU, gru); // Math layers LBANN_REGISTER_DEFAULT_BUILDER(Abs, abs); diff --git a/src/proto/layers.proto b/src/proto/layers.proto index be5f2883155..9ccb656499a 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -103,6 +103,7 @@ message Layer { ChannelwiseScaleBias channelwise_scale_bias = 329; EntrywiseScaleBias entrywise_scale_bias = 330; ChannelwiseFullyConnected channelwise_fully_connected = 331; + GRU gru = 333; // Loss layers CrossEntropy cross_entropy = 60; @@ -662,6 +663,28 @@ message Layer { google.protobuf.BoolValue transpose = 3; } + /** @brief Gated recurrent unit + * + * Expects two inputs: a 2D input sequence ( + * @f$ \text{sequence\_length}\times\text{input\_size} @f$ ) + * and a 1D initial hidden state ( @f$ \text{hidden\_size} @f$ ). + * + * Uses four weights: "ih\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{input\_size} @f$ ), + * "hh\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{hidden\_size} @f$ ), + * "ih_bias" ( @f$ 3 \text{hidden\_size} @f$ ), + * "hh_bias" ( @f$ 3 \text{hidden\_size} @f$ ). 
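+ *
+ * The matrices and biases stack the reset, update, and candidate
+ * ("new") gate parameters along the first dimension, in that order
+ * (the same ordering as the NumPy reference in the accompanying
+ * unit test).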
+ * + * @todo Support CPU + * @todo Support bidirectional RNNs + * @todo Support stacked RNNs + */ + message GRU { + /// Size of each hidden state and output vector + uint64 hidden_size = 1; + } + ////////////////// // Image layers // ////////////////// diff --git a/src/utils/cudnn.cpp b/src/utils/cudnn.cpp index 18724b6bc24..3f28a5e8c45 100644 --- a/src/utils/cudnn.cpp +++ b/src/utils/cudnn.cpp @@ -241,6 +241,446 @@ void copy_activation_desc(const cudnnActivationDescriptor_t& src, } +//////////////////////////////////////////////////////////// +// Wrapper classes for cuDNN types +//////////////////////////////////////////////////////////// + +// ----------------------------- +// TensorDescriptor +// ----------------------------- + +TensorDescriptor::TensorDescriptor(cudnnTensorDescriptor_t desc) + : desc_{desc} +{} + +TensorDescriptor::~TensorDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyTensorDescriptor(desc_); + } +} + +TensorDescriptor::TensorDescriptor(const TensorDescriptor& other) { + if (other.desc_) { + cudnnDataType_t data_type; + int num_dims; + CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + other.desc_, + 0, // nbDimsRequested + &data_type, + &num_dims, + nullptr, // dimA + nullptr)); // strideA + std::vector dims(num_dims), strides(num_dims); + CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + other.desc_, + num_dims, + &data_type, + &num_dims, + dims.data(), + strides.data())); + set(data_type, dims, strides); + } +} + +TensorDescriptor::TensorDescriptor(TensorDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +} + +TensorDescriptor& TensorDescriptor::operator=(TensorDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(TensorDescriptor& first, TensorDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void TensorDescriptor::reset(cudnnTensorDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyTensorDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnTensorDescriptor_t TensorDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnTensorDescriptor_t TensorDescriptor::get() const noexcept { + return desc_; +} + +TensorDescriptor::operator cudnnTensorDescriptor_t() const noexcept { + return get(); +} + +void TensorDescriptor::create() { + if (!desc_) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&desc_)); + } +} + +void TensorDescriptor::set( + cudnnDataType_t data_type, + const std::vector& dims, + std::vector strides) { + + // Check that arguments are valid + if (dims.empty()) { + LBANN_ERROR("attempted to set cuDNN tensor descriptor with no dimensions"); + } + if (dims.size() < 3) { + // As of cuDNN 7.65, cuDNN does not support tensors with <3 dims + LBANN_ERROR( + "attempted to set cuDNN tensor descriptor with fewer than 3 dimensions"); + } + if (!strides.empty() && dims.size() != strides.size()) { + LBANN_ERROR( + "attempted to set cuDNN tensor descriptor ", + "with mismatched dimensions (",dims.size(),") ", + "and strides (",strides.size(),")"); + } + + // Assume data is contiguous if no strides are provided + if (strides.empty()) { + strides.resize(dims.size(), 1); + for (int i=strides.size()-1; i>0; --i) { + strides[i-1] = strides[i] * dims[i]; + } + } + + // Set cuDNN object + create(); + CHECK_CUDNN( + cudnnSetTensorNdDescriptor( + desc_, + data_type, + dims.size(), + dims.data(), + strides.data())); + +} + +// ----------------------------- +// FilterDescriptor +// ----------------------------- + 
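The FilterDescriptor, DropoutDescriptor, and RNNDescriptor wrappers below repeat the RAII pattern just established for TensorDescriptor: lazy create(), value semantics via copy-and-swap, and release()/reset() for interoperating with raw handles. A minimal usage sketch of the TensorDescriptor interface above; the cudnn namespace and header path are assumptions based on how the GRU layer code refers to these utilities, and the dims are illustrative:

#include "lbann/utils/cudnn.hpp"   // assumed header for the wrappers

void describe_tensor_example() {
  // Default-constructed wrapper owns no cuDNN object yet.
  cudnn::TensorDescriptor desc;
  // set() lazily creates the descriptor; with an empty stride vector,
  // strides are computed assuming fully-packed (contiguous) data.
  // cuDNN requires at least 3 dims, as the checks above enforce.
  desc.set(CUDNN_DATA_FLOAT, {1, 64, 128}, {});
  // The conversion operator hands the raw handle to cuDNN calls;
  // ownership stays with the wrapper, which destroys the descriptor
  // in its destructor.
  cudnnTensorDescriptor_t raw = desc;
  (void) raw;
}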
+FilterDescriptor::FilterDescriptor(cudnnFilterDescriptor_t desc) + : desc_{desc} +{} + +FilterDescriptor::~FilterDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyFilterDescriptor(desc_); + } +} + +FilterDescriptor::FilterDescriptor(const FilterDescriptor& other) { + if (other.desc_) { + int num_dims; + cudnnDataType_t data_type; + cudnnTensorFormat_t format; + CHECK_CUDNN( + cudnnGetFilterNdDescriptor( + other.desc_, + 0, // nbDimsRequested + &data_type, + &format, + &num_dims, + nullptr)); // filterDimA + std::vector dims(num_dims); + CHECK_CUDNN( + cudnnGetFilterNdDescriptor( + other.desc_, + num_dims, + &data_type, + &format, + &num_dims, + dims.data())); + set(data_type, format, dims); + } +} + +FilterDescriptor::FilterDescriptor(FilterDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +} + +FilterDescriptor& FilterDescriptor::operator=(FilterDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(FilterDescriptor& first, FilterDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void FilterDescriptor::reset(cudnnFilterDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyFilterDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnFilterDescriptor_t FilterDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnFilterDescriptor_t FilterDescriptor::get() const noexcept { + return desc_; +} + +FilterDescriptor::operator cudnnFilterDescriptor_t() const noexcept { + return get(); +} + +void FilterDescriptor::create() { + if (!desc_) { + CHECK_CUDNN(cudnnCreateFilterDescriptor(&desc_)); + } +} + +void FilterDescriptor::set( + cudnnDataType_t data_type, + cudnnTensorFormat_t format, + const std::vector& dims) { + create(); + CHECK_CUDNN( + cudnnSetFilterNdDescriptor( + desc_, + data_type, + format, + dims.size(), + dims.data())); +} + +// ----------------------------- +// DropoutDescriptor +// ----------------------------- + +DropoutDescriptor::DropoutDescriptor(cudnnDropoutDescriptor_t desc) + : desc_{desc} +{} + +DropoutDescriptor::~DropoutDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyDropoutDescriptor(desc_); + } +} + +DropoutDescriptor::DropoutDescriptor(const DropoutDescriptor& other) { + if (other.desc_) { + float dropout; + void* states; + size_t states_size; + unsigned long long seed; + CHECK_CUDNN(cudnnDropoutGetStatesSize(get_handle(), &states_size)); + CHECK_CUDNN( + cudnnGetDropoutDescriptor( + other.desc_, + get_handle(), + &dropout, + &states, + &seed)); + set(dropout, states, states_size, seed); + } +} + +DropoutDescriptor::DropoutDescriptor(DropoutDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +} + +DropoutDescriptor& DropoutDescriptor::operator=(DropoutDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(DropoutDescriptor& first, DropoutDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void DropoutDescriptor::reset(cudnnDropoutDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyDropoutDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnDropoutDescriptor_t DropoutDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnDropoutDescriptor_t DropoutDescriptor::get() const noexcept { + return desc_; +} + +DropoutDescriptor::operator cudnnDropoutDescriptor_t() const noexcept { + return get(); +} + +void DropoutDescriptor::create() { + if (!desc_) { + 
CHECK_CUDNN(cudnnCreateDropoutDescriptor(&desc_)); + } +} + +void DropoutDescriptor::set( + float dropout, + void* states, + size_t states_size, + unsigned long long seed) { + create(); + CHECK_CUDNN( + cudnnSetDropoutDescriptor( + desc_, + get_handle(), + dropout, + states, + states_size, + seed)); +} + +// ----------------------------- +// RNNDescriptor +// ----------------------------- + +RNNDescriptor::RNNDescriptor(cudnnRNNDescriptor_t desc) + : desc_{desc} +{} + +RNNDescriptor::~RNNDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyRNNDescriptor(desc_); + } +} + +RNNDescriptor::RNNDescriptor(const RNNDescriptor& other) { + if (other.desc_) { + int hidden_size, num_layers; + cudnnDropoutDescriptor_t dropout_desc; + cudnnRNNInputMode_t input_mode; + cudnnDirectionMode_t direction; + cudnnRNNMode_t mode; + cudnnRNNAlgo_t algo; + cudnnDataType_t math_precision; +#if CUDNN_VERSION >= 8000 + CHECK_CUDNN( + cudnnGetRNNDescriptor_v6( + get_handle(), + other.desc_, + &hidden_size, + &num_layers, + &dropout_desc, + &input_mode, + &direction, + &mode, + &algo, + &math_precision)); +#else // CUDNN_VERSION < 8000 + CHECK_CUDNN( + cudnnGetRNNDescriptor( + get_handle(), + other.desc_, + &hidden_size, + &num_layers, + &dropout_desc, + &input_mode, + &direction, + &mode, + &algo, + &math_precision)); +#endif // CUDNN_VERSION >= 8000 + set( + hidden_size, + num_layers, + dropout_desc, + input_mode, + direction, + mode, + algo, + math_precision); + } +} + +RNNDescriptor::RNNDescriptor(RNNDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +} + +RNNDescriptor& RNNDescriptor::operator=(RNNDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(RNNDescriptor& first, RNNDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void RNNDescriptor::reset(cudnnRNNDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyRNNDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnRNNDescriptor_t RNNDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnRNNDescriptor_t RNNDescriptor::get() const noexcept { + return desc_; +} + +RNNDescriptor::operator cudnnRNNDescriptor_t() const noexcept { + return get(); +} + +void RNNDescriptor::create() { + if (!desc_) { + CHECK_CUDNN(cudnnCreateRNNDescriptor(&desc_)); + } +} + +void RNNDescriptor::set( + size_t hidden_size, + size_t num_layers, + cudnnDropoutDescriptor_t dropout_desc, + cudnnRNNInputMode_t input_mode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnRNNAlgo_t algo, + cudnnDataType_t math_precision) { + create(); + CHECK_CUDNN( + cudnnSetRNNDescriptor_v6( + get_handle(), + desc_, + hidden_size, + num_layers, + dropout_desc, + input_mode, + direction, + mode, + algo, + math_precision)); +} + //////////////////////////////////////////////////////////// // Base cuDNN tensor manager //////////////////////////////////////////////////////////// From d7e5c58a19bb621bcc5441f5b8dc92718de55fe5 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 15 Sep 2020 13:43:00 -0700 Subject: [PATCH 27/36] ATOM VAE (#1628) * draft implementation of ATOM VAE * VAE draft * VAE draft * Add smaller (10K) dataset, and model cleanup * Add smaller (10K) dataset, and model cleanup * Add filename and filedir to arg * More args added to streamline large scale experiments * Add encoder tag to VAE model * More argument parsers to streamline experiments --- applications/ATOM/models/vae.py | 4 ++-- applications/ATOM/train_atom_vae.py | 17 
+++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/applications/ATOM/models/vae.py b/applications/ATOM/models/vae.py index f7a6e180cfd..2090a026114 100644 --- a/applications/ATOM/models/vae.py +++ b/applications/ATOM/models/vae.py @@ -131,8 +131,8 @@ def __init__(self, input_feature_dims,dictionary_size, embedding_size, ignore_la datatype=self.datatype, weights_datatype=self.weights_datatype, ) - self.q_mu = fc(128,name=self.name+'_qmu') - self.q_logvar = fc(128,name=self.name+'_qlogvar') + self.q_mu = fc(128,name=self.name+'_encoder_qmu') + self.q_logvar = fc(128,name=self.name+'_encoder_qlogvar') for w in self.q_mu.weights + self.q_logvar.weights: w.datatype = self.weights_datatype diff --git a/applications/ATOM/train_atom_vae.py b/applications/ATOM/train_atom_vae.py index 1adf144e9bf..e5875e149f3 100644 --- a/applications/ATOM/train_atom_vae.py +++ b/applications/ATOM/train_atom_vae.py @@ -7,13 +7,15 @@ from google.protobuf import text_format as txtf import json import numpy as np -import vae as molvae +import models.vae as molvae import lbann import lbann.contrib.launcher import lbann.modules from lbann.util import str_list +def list2str(l): + return ' '.join(l) def construct_lc_launcher_args(): @@ -60,6 +62,7 @@ def construct_lc_launcher_args(): parser.add_argument("--no-header", type=bool, default=True) parser.add_argument("--ltfb", type=bool, default=False) parser.add_argument("--ltfb-batch-interval", type=int, default=100) + parser.add_argument("--weights-to-send", type=str, default='') # these are specific to the Trainer object parser.add_argument( @@ -129,11 +132,17 @@ def construct_model(run_args): ] callbacks = [lbann.CallbackPrint(), - lbann.CallbackTimer(), - lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir, epoch_interval=run_args.dump_weights_interval)] + lbann.CallbackTimer()] + if(run_args.dump_weights_interval > 0): + callbacks.append(lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir, + epoch_interval=run_args.dump_weights_interval)) if(run_args.ltfb): + send_name = ('' if run_args.weights_to_send == 'All' else run_args.weights_to_send) #hack for Merlin empty string + weights_to_ex = [w.name for w in weights if send_name in w.name] + print("LTFB Weights to exchange ", weights_to_ex) callbacks.append(lbann.CallbackLTFB(batch_interval=run_args.ltfb_batch_interval,metric='recon', + weights = list2str(weights_to_ex), low_score_wins=True,exchange_hyperparameters=True)) # Construct model return lbann.Model(run_args.num_epochs, @@ -250,7 +259,7 @@ def main(): nodes=run_args.nodes, procs_per_node=ppn, #batch_job = True, - setup_only = True, + #setup_only = True, job_name=run_args.job_name, experiment_dir=experiment_dir, lbann_args = m_lbann_args, From 3bff6a8e6767b56ea63850738fd1eeb5494684b5 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 15 Sep 2020 16:27:55 -0700 Subject: [PATCH 28/36] Optimizations for ATOM VAE model (#1627) * Enable tensor cores in ATOM VAE model * Skip bad gradients in Adam --- applications/ATOM/models/vae.py | 2 +- applications/ATOM/train_atom_vae.py | 4 ++++ include/lbann/utils/cuda.hpp | 3 +++ include/lbann/utils/impl/cuda.hpp | 6 ++++-- src/layers/learning/gru.cpp | 4 ++++ src/optimizers/adam.cpp | 6 ++++++ src/optimizers/adam.cu | 6 ++++++ 7 files changed, 28 insertions(+), 3 deletions(-) diff --git a/applications/ATOM/models/vae.py b/applications/ATOM/models/vae.py index 2090a026114..066e2c78e10 100644 --- a/applications/ATOM/models/vae.py +++ b/applications/ATOM/models/vae.py @@ -118,7 +118,7 
@@ def __init__(self, input_feature_dims,dictionary_size, embedding_size, ignore_la self.embedding_size = embedding_size self.dictionary_size = dictionary_size self.label_to_ignore = ignore_label - self.datatype = lbann.DataType.FLOAT + self.datatype = lbann.DataType.FP16 self.weights_datatype = lbann.DataType.FLOAT fc = lbann.modules.FullyConnectedModule diff --git a/applications/ATOM/train_atom_vae.py b/applications/ATOM/train_atom_vae.py index e5875e149f3..15e36270de7 100644 --- a/applications/ATOM/train_atom_vae.py +++ b/applications/ATOM/train_atom_vae.py @@ -263,6 +263,10 @@ def main(): job_name=run_args.job_name, experiment_dir=experiment_dir, lbann_args = m_lbann_args, + environment = { + 'LBANN_USE_CUBLAS_TENSOR_OPS' : 1, + 'LBANN_USE_CUDNN_TENSOR_OPS' : 1, + }, ) print("LBANN launcher status:\n" + str(status)) diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp index 4168119156a..9778dfa7dcf 100644 --- a/include/lbann/utils/cuda.hpp +++ b/include/lbann/utils/cuda.hpp @@ -167,6 +167,9 @@ template __device__ __forceinline__ T tanh(const T& x); template __device__ __forceinline__ T acosh(const T& x); template __device__ __forceinline__ T asinh(const T& x); template __device__ __forceinline__ T atanh(const T& x); +template __device__ __forceinline__ bool isfinite(const T& x); +template __device__ __forceinline__ bool isinf(const T& x); +template __device__ __forceinline__ bool isnan(const T& x); // Binary math functions template __device__ __forceinline__ T min(const T& x, const T& y); diff --git a/include/lbann/utils/impl/cuda.hpp b/include/lbann/utils/impl/cuda.hpp index cb83e1e83c5..2517c5ea7b3 100644 --- a/include/lbann/utils/impl/cuda.hpp +++ b/include/lbann/utils/impl/cuda.hpp @@ -178,14 +178,16 @@ WRAP_UNARY_CUDA_MATH_FUNCTION(atanh) template __device__ __forceinline__ bool isfinite(T const& x) { return ::isfinite(x); } - +template __device__ __forceinline__ +bool isinf(T const& x) { return ::isinf(x); } template __device__ __forceinline__ bool isnan(T const& x) { return ::isnan(x); } #if __CUDA_ARCH__ >= 530 template <> __device__ __forceinline__ bool isfinite(__half const& x) { return !(::__isnan(x) || ::__hisinf(x)); } - +template <> __device__ __forceinline__ +bool isinf(__half const& x) { return ::__hisinf(x); } template <> __device__ __forceinline__ bool isnan(__half const& x) { return ::__hisnan(x); } diff --git a/src/layers/learning/gru.cpp b/src/layers/learning/gru.cpp index a6df3666f5c..96176c26eb7 100644 --- a/src/layers/learning/gru.cpp +++ b/src/layers/learning/gru.cpp @@ -217,6 +217,10 @@ void gru_layer::setup_gpu() { CUDNN_GRU, CUDNN_RNN_ALGO_STANDARD, data_type); + CHECK_CUDNN( + cudnnSetRNNMatrixMathType( + m_rnn_cudnn_desc, + cudnn::get_default_convolution_math_type())); // Input and output tensor descriptors m_input_cudnn_desc.set(data_type, 1, input_size, 1); diff --git a/src/optimizers/adam.cpp b/src/optimizers/adam.cpp index 551920b4f94..8bb89b5a31e 100644 --- a/src/optimizers/adam.cpp +++ b/src/optimizers/adam.cpp @@ -158,6 +158,9 @@ void adam::step_compute_cpu(AbsDistMatrixType& values, for (size_t i = 0; i < local_size; ++i) { auto& x = values_buffer[i]; const auto& g = gradient_buffer[i] + m_eps; // Avoid denormalized floats + if (std::isinf(g) || std::isnan(g)) { + continue; + } auto& m1 = moment1_buffer[i]; auto& m2 = moment2_buffer[i]; m1 = m_beta1 * m1 + (one - m_beta1) * g; @@ -177,6 +180,9 @@ void adam::step_compute_cpu(AbsDistMatrixType& values, for (size_t row = 0; row < local_height; ++row) { auto& x = 
values_buffer[row+col*values_ldim]; const auto& g = gradient_buffer[row+col*gradient_ldim] + m_eps; // Avoid denormalized floats + if (std::isinf(g) || std::isnan(g)) { + continue; + } auto& m1 = moment1_buffer[row+col*moment1_ldim]; auto& m2 = moment2_buffer[row+col*moment2_ldim]; m1 = m_beta1 * m1 + (one - m_beta1) * g; diff --git a/src/optimizers/adam.cu b/src/optimizers/adam.cu index ac12ebf38c1..f4e67d3be0a 100644 --- a/src/optimizers/adam.cu +++ b/src/optimizers/adam.cu @@ -50,6 +50,9 @@ __global__ void adam_noncontiguous_kernel(size_t height, const auto& row = gid % height; const auto& col = gid / height; const auto& g = gradient[row + col * gradient_ldim] + eps; + if (cuda::isinf(g) || cuda::isnan(g)) { + return; + } auto& m1 = moment1[row + col * moment1_ldim]; auto& m2 = moment2[row + col * moment2_ldim]; auto& x = values[row + col * values_ldim]; @@ -72,6 +75,9 @@ __global__ void adam_contiguous_kernel(size_t size, const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; if (gid < size) { const auto& g = gradient[gid] + eps; + if (cuda::isinf(g) || cuda::isnan(g)) { + return; + } auto& m1 = moment1[gid]; auto& m2 = moment2[gid]; auto& x = values[gid]; From 9faec4575c2422a1b4c20b87948513560a2d4d3f Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 17 Sep 2020 15:55:03 -0700 Subject: [PATCH 29/36] CUDA kernel for tensor reordering in GRU layer (#1629) * Optimize CUDA kernel for tensor reordering in GRU layer * Improve numerical stability of ATOM VAE recon loss * Change datatype in ATOM VAE model to fp32 * Remove safe divide in ATOM VAE model --- applications/ATOM/models/vae.py | 56 +++++++----- include/lbann/utils/cuda.hpp | 13 ++- src/layers/learning/gru.cpp | 150 ++++++++++++++------------------ src/utils/cuda.cu | 149 ++++++++++++++++++++++++++++++- 4 files changed, 256 insertions(+), 112 deletions(-) diff --git a/applications/ATOM/models/vae.py b/applications/ATOM/models/vae.py index 066e2c78e10..490f156f872 100644 --- a/applications/ATOM/models/vae.py +++ b/applications/ATOM/models/vae.py @@ -118,7 +118,7 @@ def __init__(self, input_feature_dims,dictionary_size, embedding_size, ignore_la self.embedding_size = embedding_size self.dictionary_size = dictionary_size self.label_to_ignore = ignore_label - self.datatype = lbann.DataType.FP16 + self.datatype = lbann.DataType.FLOAT self.weights_datatype = lbann.DataType.FLOAT fc = lbann.modules.FullyConnectedModule @@ -230,13 +230,16 @@ def forward_encoder(self, x_emb): z = lbann.Add([mu, (lbann.Multiply([lbann.Exp(lbann.WeightedSum(logvar,scaling_factors='0.5')),eps]))]) # kl_loss = 0.5 * (logvar.exp() + mu ** 2 - 1 - logvar).sum(1).mean() - kl_loss = lbann.Reduction(lbann.WeightedSum( - [lbann.Exp(logvar), - lbann.Square(mu), - lbann.Constant(value=1.0, hint_layer=mu), - logvar], - scaling_factors='0.5 0.5 -0.5 -0.5'), - mode='sum') + kl_loss = lbann.Reduction( + lbann.WeightedSum( + lbann.Exp(logvar), + lbann.Square(mu), + self.constant(1, hint_layer=mu), + logvar, + scaling_factors='0.5 0.5 -0.5 -0.5', + ), + mode='sum', + ) return z, kl_loss @@ -307,17 +310,14 @@ def compute_loss(self, x, y): # Note: Ignored indices result in zero vectors ignore_mask = lbann.Equal( x, - lbann.Constant(value=self.label_to_ignore, hint_layer=x), + self.constant(self.label_to_ignore, hint_layer=x), ) keep_mask = lbann.LogicalNot(ignore_mask) length = lbann.Reduction(keep_mask, mode='sum') - length = lbann.Max( - length, - lbann.Constant(value=1, num_neurons=str_list([1])), - ) + length = lbann.Max(length, self.constant(1, [1])) x = lbann.Add( 
lbann.Multiply(keep_mask, x), - lbann.Multiply(ignore_mask, lbann.Constant(value=-1, hint_layer=x)), + lbann.Multiply(ignore_mask, self.constant(-1, hint_layer=x)), ) x = lbann.Slice(x, slice_points=str_list(range(self.input_feature_dims))) x = [lbann.Identity(x) for _ in range(self.input_feature_dims-1)] @@ -330,12 +330,18 @@ def compute_loss(self, x, y): # x[:, 1:].contiguous().view(-1), # ignore_index=self.pad # ) + # Note: Ideally we'd shift y by y.max(-1) for numerical stability + shifts = lbann.MatMul( + lbann.Max(y, self.constant(0, hint_layer=y)), + self.constant( + 1 / math.sqrt(self.dictionary_size), + [self.dictionary_size, self.dictionary_size], + ), + ) + y = lbann.Subtract(y, shifts) z = lbann.MatMul( lbann.Exp(y), - lbann.Constant( - value=1, - num_neurons=str_list([self.dictionary_size, 1]), - ), + self.constant(1, [self.dictionary_size, 1]), ) z = lbann.Log(z) z = lbann.MatMul( @@ -343,12 +349,20 @@ def compute_loss(self, x, y): z, ) recon_loss = lbann.MatMul( - lbann.Reshape(y, dims=str_list([-1, 1])), - lbann.Reshape(x, dims=str_list([-1, 1])), - transpose_a=True, + lbann.Reshape(y, dims=str_list([1, -1])), + lbann.Reshape(x, dims=str_list([1, -1])), + transpose_b=True, ) recon_loss = lbann.Subtract(z, recon_loss) recon_loss = lbann.Reshape(recon_loss, dims=str_list([1])) recon_loss = lbann.Divide(recon_loss, length) return recon_loss + + def constant(self, value, dims=[], datatype=None, hint_layer=None): + return lbann.Constant( + value=value, + num_neurons=str_list(dims), + datatype=datatype, + hint_layer=hint_layer, + ) diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp index 9778dfa7dcf..de2cfb213fa 100644 --- a/include/lbann/utils/cuda.hpp +++ b/include/lbann/utils/cuda.hpp @@ -225,8 +225,9 @@ class event_wrapper { }; // ------------------------------------------------------------- -// Helper functions for entrywise operations +// Helper functions for tensor operations // ------------------------------------------------------------- + #ifdef __CUDACC__ /** Apply an entry-wise unary operator to GPU data. @@ -270,6 +271,16 @@ void apply_entrywise_binary_operator( #endif // __CUDACC__ +/** Copy entries between GPU tensors. 
*/ +template +void copy_tensor( + cudaStream_t stream, + const std::vector& dims, + const TensorDataType* input, + const std::vector& input_strides, + TensorDataType* output, + const std::vector& output_strides); + // ------------------------------------------------------------- // Utilities for Thrust // ------------------------------------------------------------- diff --git a/src/layers/learning/gru.cpp b/src/layers/learning/gru.cpp index 96176c26eb7..4227b6723f2 100644 --- a/src/layers/learning/gru.cpp +++ b/src/layers/learning/gru.cpp @@ -383,11 +383,11 @@ void fp_compute_impl( using ByteBuffer = hydrogen::simple_buffer; // Matrices - const auto& local_input_sequence + const auto& input_sequence = dynamic_cast(l.get_local_prev_activations(0)); - const auto& local_init_hidden + const auto& init_hidden = dynamic_cast(l.get_local_prev_activations(1)); - auto& local_output_sequence + auto& output_sequence = dynamic_cast(l.get_local_activations()); const auto& ih_matrix = dynamic_cast(l.weights_values(0).LockedMatrix()); @@ -400,7 +400,7 @@ void fp_compute_impl( // Dimensions const size_t sequence_length = l.get_input_dims(0)[0]; - const size_t mini_batch_size = local_input_sequence.Width(); + const size_t mini_batch_size = input_sequence.Width(); const size_t input_size = l.get_input_size(0) / sequence_length; const size_t hidden_size = l.m_hidden_size; @@ -410,7 +410,8 @@ void fp_compute_impl( } // GPU objects - auto&& sync_info = local_input_sequence.GetSyncInfo(); + auto&& sync_info = input_sequence.GetSyncInfo(); + auto&& stream = sync_info.Stream(); auto&& handle = cudnn::get_handle(); const auto data_type = cudnn::get_data_type(); @@ -427,23 +428,19 @@ void fp_compute_impl( // Reorder input tensor dims // Note: cuDNN uses sequence_length x mini_batch_size x hidden_size - /// @todo Consider custom kernel LocalMat input_sequence_workspace, output_sequence_workspace; input_sequence_workspace.SetSyncInfo(sync_info); output_sequence_workspace.SetSyncInfo(sync_info); input_sequence_workspace.Resize(mini_batch_size*input_size, sequence_length); output_sequence_workspace.Resize(mini_batch_size*hidden_size, sequence_length); - for (size_t i=0; i(input_sequence.LDim()), input_size, one}, + input_sequence_workspace.Buffer(), + {input_size, mini_batch_size*input_size, one}); // Pack weights into workspace buffer auto packed_weights = pack_cudnn_rnn_weights( @@ -488,7 +485,7 @@ void fp_compute_impl( input_desc_list.data(), input_sequence_workspace.LockedBuffer(), hidden_desc, - local_init_hidden.LockedBuffer(), + init_hidden.LockedBuffer(), hidden_desc, // cxDesc nullptr, // cx l.m_packed_weights_cudnn_desc, @@ -506,18 +503,13 @@ void fp_compute_impl( // Reorder output tensor dims // Note: cuDNN uses sequence_length x mini_batch_size x hidden_size - /// @todo Consider custom kernel - for (size_t i=0; i(output_sequence.LDim()), hidden_size, one}); } #endif // LBANN_HAS_CUDNN @@ -642,17 +634,17 @@ void bp_compute_impl( using ByteBuffer = hydrogen::simple_buffer; // Matrices - const auto& local_input_sequence + const auto& input_sequence = dynamic_cast(l.get_local_prev_activations(0)); - const auto& local_init_hidden + const auto& init_hidden = dynamic_cast(l.get_local_prev_activations(1)); - const auto& local_output_sequence + const auto& output_sequence = dynamic_cast(l.get_local_activations()); - const auto& local_output_sequence_grad + const auto& output_sequence_grad = dynamic_cast(l.get_local_prev_error_signals()); - auto& local_input_sequence_grad + auto& input_sequence_grad = 
dynamic_cast(l.get_local_error_signals(0)); - auto& local_init_hidden_grad + auto& init_hidden_grad = dynamic_cast(l.get_local_error_signals(1)); const auto& ih_matrix = dynamic_cast(l.weights_values(0).LockedMatrix()); @@ -665,12 +657,13 @@ void bp_compute_impl( // Dimensions const size_t sequence_length = l.get_input_dims(0)[0]; - const size_t mini_batch_size = local_input_sequence.Width(); + const size_t mini_batch_size = input_sequence.Width(); const size_t input_size = l.get_input_size(0) / sequence_length; const size_t hidden_size = l.m_hidden_size; // GPU objects - auto&& sync_info = local_input_sequence.GetSyncInfo(); + auto&& sync_info = input_sequence.GetSyncInfo(); + auto&& stream = sync_info.Stream(); auto&& handle = cudnn::get_handle(); // Define closure to send weight gradients to optimizers @@ -731,7 +724,6 @@ void bp_compute_impl( // Reorder tensor dims // Note: cuDNN uses sequence_length x mini_batch_size x size - /// @todo Consider custom kernel LocalMat input_sequence_workspace, output_sequence_workspace; LocalMat input_sequence_grad_workspace, output_sequence_grad_workspace; input_sequence_workspace.SetSyncInfo(sync_info); @@ -742,39 +734,28 @@ void bp_compute_impl( output_sequence_workspace.Resize(mini_batch_size*hidden_size, sequence_length); input_sequence_grad_workspace.Resize(mini_batch_size*input_size, sequence_length); output_sequence_grad_workspace.Resize(mini_batch_size*hidden_size, sequence_length); - for (size_t i=0; i; + +/** + * Block dimensions: bdimx x bdimy x bdimz + * + * Grid dimensions: (dim[3] / bdimx) x (dim[2] / bdimy) x (dim[1] / bdimx) + */ +template +__global__ void copy_4d_kernel( + int4 dims, + const TensorDataType* __restrict__ input, + int4 input_strides, + TensorDataType* __restrict__ output, + int4 output_strides) { + + // Indices + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = threadIdx.y + blockIdx.y * blockDim.y; + const auto& gidz = threadIdx.z + blockIdx.z * blockDim.z; + const auto& nthreadsx = gridDim.x * blockDim.x; + const auto& nthreadsy = gridDim.y * blockDim.y; + const auto& nthreadsz = gridDim.z * blockDim.z; + + for (int i0=0; i0 + +template +void copy_tensor( + cudaStream_t stream, + const std::vector& dims, + const TensorDataType* input, + const std::vector& input_strides, + TensorDataType* output, + const std::vector& output_strides) { + + // Check inputs + if (dims.empty() || dims.size() > 4) { + LBANN_ERROR("invalid number of tensor dimensions (",dims.size(),")"); + } + if (dims.size() != input_strides.size()) { + LBANN_ERROR( + "number of input strides (",input_strides.size(),") ", + "does not match number of tensor dimensions (",dims.size(),")"); + } + if (dims.size() != output_strides.size()) { + LBANN_ERROR( + "number of output strides (",output_strides.size(),") ", + "does not match number of tensor dimensions (",dims.size(),")"); + } + + // Pad tensor dimensions to 4D + std::vector + rdims(dims.rbegin(), dims.rend()), + input_rstrides(input_strides.rbegin(), input_strides.rend()), + output_rstrides(output_strides.rbegin(), output_strides.rend()); + rdims.resize(4, 1); + input_rstrides.resize(4, input_rstrides.back()); + output_rstrides.resize(4, output_rstrides.back()); + + // Launch CUDA kernel + const auto size = std::accumulate( + dims.begin(), dims.end(), 1, std::multiplies()); + if (size > 0) { + constexpr size_t block_size = 64; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + block_dims.y = 1; + block_dims.z = 1; + grid_dims.x = (rdims[0] + block_dims.x - 1) / 
block_dims.x; + grid_dims.y = (rdims[1] + block_dims.y - 1) / block_dims.y; + grid_dims.z = (rdims[2] + block_dims.z - 1) / block_dims.z; + grid_dims.y = El::Min(grid_dims.y, 65535); + grid_dims.z = El::Min(grid_dims.z, 65535); + copy_4d_kernel<<>>( + {rdims[3], rdims[2], rdims[1], rdims[0]}, + input, + {input_rstrides[3], input_rstrides[2], + input_rstrides[1], input_rstrides[0]}, + output, + {output_rstrides[3], output_rstrides[2], + output_rstrides[1], output_rstrides[0]}); + } + +} + +#if defined(LBANN_HAS_HALF) && defined(LBANN_HAS_GPU_HALF) +template <> +void copy_tensor( + cudaStream_t stream, + const std::vector& dims, + const cpu_fp16* input, + const std::vector& input_strides, + cpu_fp16* output, + const std::vector& output_strides) { + copy_tensor( + stream, + dims, + reinterpret_cast(input), + input_strides, + reinterpret_cast(output), + output_strides); +} +#endif // defined(LBANN_HAS_HALF) && defined(LBANN_HAS_GPU_HALF) + +// Explicit template instantiation +#define PROTO(T) \ + template void copy_tensor( \ + cudaStream_t stream, \ + const std::vector& dims, \ + const T* input, \ + const std::vector& input_strides, \ + T* output, \ + const std::vector& output_strides); +#define LBANN_INSTANTIATE_GPU_HALF +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO + } // namespace cuda } // namespace lbann From a40e35678ce6442ef795e2671f14a38e4774fcd0 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 17 Sep 2020 20:44:53 -0700 Subject: [PATCH 30/36] Add optimized launcher defaults for rzansel (#1632) --- python/lbann/contrib/lc/launcher.py | 2 +- python/lbann/contrib/lc/systems.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index b31b6a7b763..9aaf54c9598 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -79,7 +79,7 @@ def set_environment(key, default): set_environment('MV2_USE_RDMA_CM', 0) # Optimizations for Sierra-like systems - if system in ('sierra', 'lassen'): + if system in ('sierra', 'lassen', 'rzansel'): # Set thread affinity # Note: Aluminum's default thread affinity is incorrect since diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py index 8e5641572ce..49a260f4559 100644 --- a/python/lbann/contrib/lc/systems.py +++ b/python/lbann/contrib/lc/systems.py @@ -23,6 +23,7 @@ def __init__(self, cores_per_node, gpus_per_node, scheduler): 'lassen': SystemParams(44, 4, 'lsf'), 'ray': SystemParams(40, 4, 'lsf'), 'sierra': SystemParams(44, 4, 'lsf'), + 'rzansel': SystemParams(44, 4, 'lsf'), } # Detect system From 4117b0163c74371e990834b1ff1ed959f4aa3ef4 Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Tue, 22 Sep 2020 00:05:56 -0700 Subject: [PATCH 31/36] Sample list image bcast (#1401) * sample list applied to imagenet data reader * fix the way to parse index list protobuf variable for imagenet reader * Add a method to reorder the sample list back to its original order before all_gather as needed (when a loading stride greater than 1 is used). This makes sure that the order of samples matches that of the labels in data store. * make sure the order of samples loaded by data reader is consistent with that of the labels in data store. 
* do not remove the path prefix of the index_list passed from prototext input
* fix the reordering of sample list to undo interleaving
* add option to keep the sample order or ignore that for performance
* show the full path in the error message about missing data files
* replace all the instances of "index_list" with "sample_list"
* resolve merge conflicts
* add label filename line to the base sample list format; clean up sample list header setup; change sample list type keywords
* data reader prototext in model zoo examples to include sample list
* allow keeping the order of loaded samples the same as in the file, to make testing and validation easier
* allow overriding samples file directory from command line option data_filedir_*
* only trainer master loads the whole sample list file into a buffer, and broadcasts it. Then, everyone constructs a sample list from the buffer in memory.
* disable file existence check by default
* fix the outdated format of sample list header written out
* resolve merge conflict
* update data_reader_jag_conduit for applying data_filedir override
* sample list header backward compatibility
* change m_sample_list from private to protected; update the calculation to estimate the length of string converted from sample list object
* replace the label map implemented as an unordered map with a vector
* use option '--load_full_sample_list_once' for tests with imagenet
* 'keep_sample_order' option available as a command line option in addition to the prototext variable of data reader
* fix the sample list path in the prototext examples, and fix the sample list path customization for multi-trainer/model
* Allow use of the original imagenet data list file by generating the sample list header on-the-fly
* fix the filedir overriding
* update the message when writing sample list out; comments on the sample list backward compatibility
* remove the feature to override data_filedir in sample list
* Add a function to dump a sample-label list to facilitate the verification of sample list content for imagenet. Enable it by the command line option `--write_sample_label_list` and combine it with `--keep_sample_order`
* change the imagenet data reader prototext to use compatibility mode
* Enable sample list path on LC for the data reader of the python front-end
* Bug fix for merge. Co-authored-by: Brian C.
Van Essen --- .../data/imagenet/data_reader.prototext | 13 +- .../test_integration_alexnet.py | 3 +- .../test_integration_resnet50.py | 3 +- include/lbann/data_readers/data_reader.hpp | 29 +- .../lbann/data_readers/data_reader_image.hpp | 38 +- .../data_readers/data_reader_jag_conduit.hpp | 8 +- include/lbann/data_readers/sample_list.hpp | 93 ++++- .../lbann/data_readers/sample_list_impl.hpp | 358 ++++++++++++---- .../data_readers/sample_list_open_files.hpp | 13 +- .../sample_list_open_files_impl.hpp | 68 +-- include/lbann/proto/proto_common.hpp | 8 +- include/lbann/utils/file_utils.hpp | 2 +- .../data_readers/data_reader_jag.prototext | 14 +- python/lbann/contrib/lc/paths.py | 47 +++ src/data_readers/data_reader.cpp | 23 +- src/data_readers/data_reader_image.cpp | 387 ++++++++++++++++-- src/data_readers/data_reader_imagenet.cpp | 5 +- src/data_readers/data_reader_jag_conduit.cpp | 157 ++++--- src/data_store/data_store_conduit.cpp | 20 +- src/proto/proto_common.cpp | 55 ++- src/proto/reader.proto | 12 +- src/utils/file_utils.cpp | 12 +- src/utils/lbann_library.cpp | 4 +- 23 files changed, 1063 insertions(+), 309 deletions(-) diff --git a/bamboo/common_python/data/imagenet/data_reader.prototext b/bamboo/common_python/data/imagenet/data_reader.prototext index 3f4e0270f3f..08ddf8b8161 100644 --- a/bamboo/common_python/data/imagenet/data_reader.prototext +++ b/bamboo/common_python/data/imagenet/data_reader.prototext @@ -3,9 +3,11 @@ data_reader { name: "imagenet" role: "train" shuffle: true - data_filedir: "path/to/ILSVRC2012/train" - data_filename: "path/to/ILSVRC2012/labels/train.txt" + data_filedir: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/train/" + data_filename: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/labels/train.txt" + label_filename: "" validation_percent: 0.0 + absolute_sample_count: 0 percent_of_data_to_use: 1.0 num_labels: 1000 @@ -34,8 +36,11 @@ data_reader { reader { name: "imagenet" role: "validate" - data_filedir: "path/to/ILSVRC2012/val" - data_filename: "path/to/ILSVRC2012/labels/val.txt" + shuffle: true + data_filedir: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/val/" + data_filename: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/labels/val.txt" + label_filename: "" + absolute_sample_count: 0 percent_of_data_to_use: 1.0 num_labels: 1000 diff --git a/bamboo/integration_tests/test_integration_alexnet.py b/bamboo/integration_tests/test_integration_alexnet.py index 576b2852204..d54954240e2 100644 --- a/bamboo/integration_tests/test_integration_alexnet.py +++ b/bamboo/integration_tests/test_integration_alexnet.py @@ -186,5 +186,6 @@ def func(cluster, exes, dirname, weekly): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - nodes=num_nodes): + nodes=num_nodes, + lbann_args=['--load_full_sample_list_once']): globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_resnet50.py b/bamboo/integration_tests/test_integration_resnet50.py index 360e3fb20e1..77e629bf4d9 100644 --- a/bamboo/integration_tests/test_integration_resnet50.py +++ b/bamboo/integration_tests/test_integration_resnet50.py @@ -184,5 +184,6 @@ def func(cluster, exes, dirname, weekly): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - nodes=num_nodes): + nodes=num_nodes, + lbann_args=['--load_full_sample_list_once']): globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git 
a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index c2a4c8db152..cddbb491a8c 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -91,7 +91,7 @@ class generic_data_reader { m_world_master_mini_batch_adjustment(0), m_num_parallel_readers(0), m_rank_in_model(0), m_max_files_to_load(0), - m_file_dir(""), m_data_index_list(""), m_data_fn(""), m_label_fn(""), + m_file_dir(""), m_data_sample_list(""), m_data_fn(""), m_label_fn(""), m_shuffle(shuffle), m_absolute_sample_count(0), m_validation_percent(0.0), m_use_percent(1.0), m_master(false), @@ -103,6 +103,7 @@ m_procs_per_partition(1), m_io_thread_pool(nullptr), m_jag_partitioned(false), + m_keep_sample_order(false), m_trainer(nullptr), m_issue_warning(true) { @@ -165,16 +166,22 @@ std::string get_local_file_dir() const; /** - * Set the index list for your data (images, etc). - * The index lists contains an enumeration of all samples in the + * Set the sample list for your data (images, etc). + * The sample list contains an enumeration of all samples in the * data set. */ - void set_data_index_list(std::string s); + void set_data_sample_list(std::string s); /** - * Returns the complete index list for your data set. + * Returns the complete sample list for your data set. */ - std::string get_data_index_list() const; + std::string get_data_sample_list() const; + + /** + * To facilitate testing, maintain the order of loaded samples + * in the sample list as it is in the list file. + */ + void keep_sample_order(bool same_order = false); /** * Set the filename for your data (images, etc). @@ -596,9 +603,9 @@ /// returns true if the data set is partitioned bool is_partitioned() const { return m_is_partitioned; } - /// Does the data reader have a unqiue index list per model + /// Does the data reader have a unique sample list per model virtual bool has_list_per_model() const { return false; } - /// Does the data reader have a unqiue index list per trainer + /// Does the data reader have a unique sample list per trainer virtual bool has_list_per_trainer() const { return false; } @@ -782,7 +789,7 @@ size_t m_max_files_to_load; std::string m_file_dir; std::string m_local_file_dir; - std::string m_data_index_list; + std::string m_data_sample_list; std::string m_data_fn; std::string m_label_fn; bool m_shuffle; @@ -855,6 +862,10 @@ /// owns a unique subset of the data bool m_jag_partitioned; + /** Whether to keep the order of loaded samples the same as in the + * file, to make testing and validation easier */ + bool m_keep_sample_order; + /// called by fetch_data a single time if m_jag_partitioned = true; /// this sets various member variables (num_iterations, m_reset_mini_batch_index, /// etc.
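For reference, a sketch of the list file that set_data_sample_list() points at, pieced together from the header parsing added later in this patch (set_sample_list_type(), set_sample_count(), set_data_file_dir(), set_label_filename()); the counts, paths, and file names below are illustrative only:

    SINGLE-SAMPLE
    3
    /p/gpfs1/brainusr/datasets/ILSVRC2012/original/train/
    /p/gpfs1/brainusr/datasets/ILSVRC2012/original/labels/train.txt
    n01440764/n01440764_10026.JPEG
    n01440764/n01440764_10027.JPEG
    n01440764/n01440764_10029.JPEG

For MULTI-SAMPLE_INCLUSION and MULTI-SAMPLE_EXCLUSION lists, the second line instead carries three counts (included samples, excluded samples, number of files). The legacy CONDUIT_HDF5_INCLUSION and CONDUIT_HDF5_EXCLUSION keywords remain accepted for backward compatibility, in which case the label-filename line is omitted.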
diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index cde595e781e..37c6f111e73 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -30,6 +30,7 @@ #define IMAGE_DATA_READER_HPP #include "data_reader.hpp" +#include "sample_list.hpp" #include "lbann/data_store/data_store_conduit.hpp" namespace lbann { @@ -38,6 +39,10 @@ class image_data_reader : public generic_data_reader { using img_src_t = std::string; using label_t = int; using sample_t = std::pair; + using sample_name_t = img_src_t; + using sample_list_t = sample_list; + using sample_idx_t = sample_list_t::sample_idx_t; + using labels_t = std::vector; image_data_reader(bool shuffle = true); image_data_reader(const image_data_reader&); @@ -78,21 +83,16 @@ class image_data_reader : public generic_data_reader { return {m_image_num_channels, m_image_height, m_image_width}; } - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - /// Allow read-only access to the entire sample list - const std::vector& get_image_list() const { - return m_image_list; + const sample_list_t& get_sample_list() const { + return m_sample_list; } /** * Returns idx-th sample in the initial loading order. * The second argument is only to facilitate overloading, and not to be used by users. */ - sample_t get_sample(const size_t idx) const { - return m_image_list.at(idx); - } + sample_t get_sample(const size_t idx) const; void do_preload_data_store() override; @@ -106,15 +106,33 @@ class image_data_reader : public generic_data_reader { bool fetch_label(Mat& Y, int data_id, int mb_idx) override; void set_linearized_image_size(); + /** Dump the image list file in which each line consists of the file name + * and the label of a sample */ + void dump_sample_label_list(const std::string& dump_file_name); + /// Rely on pre-determined list of samples. 
+ void load_list_of_samples(const std::string filename); + /// Load the sample list from a serialized archive from another rank + void load_list_of_samples_from_archive(const std::string& sample_list_archive); + /// Use the imagenet image list file, and generate sample list header on-the-fly + void gen_list_of_samples(); + /// Load the labels for samples + void load_labels(std::vector<char>& preloaded_buffer); + /// Read the labels from an open input stream + void read_labels(std::istream& istrm); + /// Return the number of lines in the input stream + size_t determine_num_of_samples(std::istream& istrm) const; + std::string m_image_dir; ///< where images are stored - std::vector<sample_t> m_image_list; ///< list of image files and labels int m_image_width; ///< image width int m_image_height; ///< image height int m_image_num_channels; ///< number of image channels int m_image_linearized_size; ///< linearized image size int m_num_labels; ///< number of labels - bool load_conduit_nodes_from_file(const std::unordered_set<int> &data_ids); + sample_list_t m_sample_list; + labels_t m_labels; + + bool load_conduit_nodes_from_file(const std::unordered_set<int> &data_ids); }; diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index be53df9aced..558a766094d 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -169,9 +169,9 @@ class data_reader_jag_conduit : public generic_data_reader { void check_image_data(); #endif // _JAG_OFFLINE_TOOL_MODE_ - /// Set every reader instances in a trainer to have an independent index list + /// Set every reader instance in a trainer to have an independent sample list void set_list_per_trainer(bool flag) { m_list_per_trainer = flag; }; - /// Set every reader instances in a model to have an independent index list + /// Set every reader instance in a model to have an independent sample list void set_list_per_model(bool flag) { m_list_per_model = flag; }; bool has_list_per_model() const override { return m_list_per_model; } @@ -316,8 +316,10 @@ * the number of models and the mini batch size. */ bool check_num_parallel_readers(long data_set_size); + /// Check the consistency of the schema of the first sample + void sample_schema_check(const bool check_data); /// Rely on pre-determined list of samples.
- void load_list_of_samples(const std::string filename, size_t stride=1, size_t offset=0); + void load_list_of_samples(const std::string filename); /// Load the sample list from a serialized archive from another rank void load_list_of_samples_from_archive(const std::string& sample_list_archive); diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp index 6d4aa5e051f..ad64dff14d0 100644 --- a/include/lbann/data_readers/sample_list.hpp +++ b/include/lbann/data_readers/sample_list.hpp @@ -18,28 +18,49 @@ namespace lbann { -static const std::string sample_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; -static const std::string sample_inclusion_list = "CONDUIT_HDF5_INCLUSION"; +static const std::string multi_sample_exclusion = "MULTI-SAMPLE_EXCLUSION"; +static const std::string multi_sample_inclusion = "MULTI-SAMPLE_INCLUSION"; +static const std::string single_sample = "SINGLE-SAMPLE"; struct sample_list_header { + /// Whether each data file includes multiple samples + bool m_is_multi_sample; + /// Whether to list the IDs of samples to exclude or to include bool m_is_exclusive; + /// Whether to read the header line for a label file + bool m_no_label_header; /// Number of included samples size_t m_included_sample_count; /// Number of excluded samples size_t m_excluded_sample_count; size_t m_num_files; + /// Data file directory std::string m_file_dir; - std::string m_sample_list_filename; + std::string m_sample_list_name; + std::string m_label_filename; sample_list_header(); + void set_sample_list_type(const std::string& line1); + void set_sample_count(const std::string& line2); + void set_data_file_dir(const std::string& line3); + void set_label_filename(const std::string& line4); + + bool is_multi_sample() const; bool is_exclusive() const; + bool use_label_header() const; size_t get_sample_count() const; size_t get_num_files() const; - const std::string& get_sample_list_filename() const; const std::string& get_file_dir() const; + const std::string& get_sample_list_name() const; + /// Save the filename or stream name of this sample list for debugging + void set_sample_list_name(const std::string& n); + const std::string& get_label_filename() const; template void serialize( Archive & ar ) { - ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); + ar(m_is_multi_sample, m_is_exclusive, m_no_label_header, + m_included_sample_count, m_excluded_sample_count, + m_num_files, m_file_dir, + m_sample_list_name, m_label_filename); } }; @@ -53,6 +74,10 @@ class sample_list { using sample_t = std::template pair; /// Type for the list of samples using samples_t = std::template vector< sample_t >; + /// Type for the index into the sample list + using sample_idx_t = typename samples_t::size_type; + /// Type for the map from sample name to the sample list index + using sample_map_t = std::unordered_map; /// Mapping of the file index to the filename using file_id_stats_v_t = std::vector< std::string >; @@ -64,11 +89,17 @@ class sample_list { void copy_members(const sample_list& rhs); - /// Load a sample list file - void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); + /// Load a sample list file using the given stride and offset on the sample sequence + void load(std::istream& istrm, size_t stride=1, size_t offset=0); - /// Load the header of a sample list file - sample_list_header load_header(const std::string& samplelist_file) const; + /** Load a sample list file using the 
stride as the number of processes per + * trainer and the offset as the current rank within the trainer if + * interleaving option is on. + */ + void load(const std::string& samplelist_file, const lbann_comm& comm, bool interleave); + void load(std::istream& istrm, const lbann_comm& comm, bool interleave); + /// Load sample list using the given header instead of reading it from the input stream + void load(const sample_list_header& header, std::istream& istrm, const lbann_comm& comm, bool interleave); /// Restore a sample list from a serialized string void load_from_string(const std::string& samplelist); @@ -103,6 +134,7 @@ class sample_list { virtual const std::string& get_samples_filename(sample_file_id_t id) const; const std::string& get_samples_dirname() const; + const std::string& get_label_filename() const; void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); void all_gather_archive_new(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); @@ -110,13 +142,33 @@ class sample_list { template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); virtual void all_gather_packed_lists(lbann_comm& comm); + /// Set to maintain the original sample order as listed in the file + void keep_sample_order(bool keep); + + /// Manually set the sample list name, which can be used for stream-based sources + void set_sample_list_name(const std::string& n); + + /// Set to check the existence of data file in the list + void set_data_file_check(); + /// Set not to check the existence of data file in the list + void unset_data_file_check(); + + /// Build map from sample names to indices for sample list + void build_sample_map_from_name_to_index(); + + /// Clear the map from sample names to indices + void clear_sample_map_from_name_to_index(); + + /// Return the index of the sample with the specified name + sample_idx_t get_sample_index(const sample_name_t& sn ); + protected: /// Reads a header line from the sample list given as a stream, and use the info string for error message - std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; + std::string read_header_line(std::istream& ifs, const std::string& listname, const std::string& info); /// Reads the header of a sample list - sample_list_header read_header(std::istream& istrm, const std::string& filename) const; + void read_header(std::istream& istrm); /// read the body of a sample list, which is the list of sample files, where each file contains a single sample. 
virtual void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0); @@ -125,7 +177,7 @@ class sample_list { virtual void assign_samples_name(); /// Reads a sample list and populates the internal list - size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); + size_t get_samples_per_file(std::istream& istrm, size_t stride=1, size_t offset=0); /// Add the header info to the given string void write_header(std::string& sstr, size_t num_files) const; @@ -135,14 +187,29 @@ class sample_list { virtual void set_samples_filename(sample_file_id_t id, const std::string& filename); + /// Reorder the sample list to its initial order + virtual void reorder(); + protected: /// header info of sample list sample_list_header m_header; - private: + /// The stride used in loading sample list file + size_t m_stride; + + /// maintain the original sample order as listed in the file + bool m_keep_order; + + /// Whether to check the existence of data file + bool m_check_data_file; + /// List of all samples with a file identifier and sample name for each sample samples_t m_sample_list; + /// Map from sample name to the corresponding index into the sample list + sample_map_t m_map_name_to_idx; + + private: /// Maps sample's file id to file names, file descriptors, and use counts file_id_stats_v_t m_file_id_stats_map; diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp index 0f161bed61f..80719640467 100644 --- a/include/lbann/data_readers/sample_list_impl.hpp +++ b/include/lbann/data_readers/sample_list_impl.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -76,15 +77,81 @@ template<> inline std::string to_sample_name_t(const std::string& s //------------------------ inline sample_list_header::sample_list_header() - : m_is_exclusive(false), m_included_sample_count(0u), - m_excluded_sample_count(0u), m_num_files(0u), - m_file_dir("") { + : m_is_multi_sample(false), m_is_exclusive(false), m_no_label_header(false), + m_included_sample_count(0u), m_excluded_sample_count(0u), m_num_files(0u), + m_file_dir(""), m_sample_list_name(""), m_label_filename("") { +} + +inline void sample_list_header::set_sample_list_type(const std::string& line1) { + std::stringstream header1(line1); + std::string sample_list_type; + header1 >> sample_list_type; + + std::for_each(sample_list_type.begin(), sample_list_type.end(), + [](char& c){ c = std::toupper(c); }); + + m_is_multi_sample = false; + m_is_exclusive = false; + m_no_label_header = false; + + if (sample_list_type == single_sample) { + } else if (sample_list_type == multi_sample_inclusion) { + m_is_multi_sample = true; + m_is_exclusive = false; + } else if (sample_list_type == multi_sample_exclusion) { + m_is_multi_sample = true; + m_is_exclusive = true; + } else if (sample_list_type == "CONDUIT_HDF5_INCLUSION") { + // For backward compatibility + m_is_multi_sample = true; + m_is_exclusive = false; + m_no_label_header = true; // old format does not use a line for label file + } else if (sample_list_type == "CONDUIT_HDF5_EXCLUSION") { + // For backward compatibility + m_is_multi_sample = true; + m_is_exclusive = true; + m_no_label_header = true; + } else { + LBANN_ERROR("Unknown sample list type: ", sample_list_type); + } +} + +inline void sample_list_header::set_sample_count(const std::string& line2) { + std::stringstream header2(line2); + if (m_is_multi_sample) { + header2 >> m_included_sample_count; + header2 >> 
m_excluded_sample_count; + } + header2 >> m_num_files; + + if (!m_is_multi_sample) { + m_included_sample_count = m_num_files; + m_excluded_sample_count = 0ul; + } +} + +inline void sample_list_header::set_data_file_dir(const std::string& line3) { + std::stringstream header3(line3); + header3 >> m_file_dir; +} + +inline void sample_list_header::set_label_filename(const std::string& line4) { + std::stringstream header4(line4); + header4 >> m_label_filename; +} + +inline bool sample_list_header::is_multi_sample() const { + return m_is_multi_sample; } inline bool sample_list_header::is_exclusive() const { return m_is_exclusive; } +inline bool sample_list_header::use_label_header() const { + return !m_no_label_header; +} + inline size_t sample_list_header::get_sample_count() const { return m_included_sample_count; } @@ -93,20 +160,29 @@ inline size_t sample_list_header::get_num_files() const { return m_num_files; } -inline const std::string& sample_list_header::get_sample_list_filename() const { - return m_sample_list_filename; -} - inline const std::string& sample_list_header::get_file_dir() const { return m_file_dir; } +inline const std::string& sample_list_header::get_sample_list_name() const { + return m_sample_list_name; +} + +inline void sample_list_header::set_sample_list_name(const std::string& n) { + m_sample_list_name = n; +} + +inline const std::string& sample_list_header::get_label_filename() const { + return m_label_filename; +} + //------------------ // sample_list //------------------ template -inline sample_list::sample_list() { +inline sample_list::sample_list() +: m_stride(1ul), m_keep_order(true), m_check_data_file(false) { } template @@ -149,6 +225,9 @@ template inline void sample_list ::copy_members(const sample_list& rhs) { m_header = rhs.m_header; + m_stride = rhs.m_stride; + m_keep_order = rhs.m_keep_order; + m_check_data_file = rhs.m_check_data_file; m_sample_list = rhs.m_sample_list; /// Keep track of existing filenames @@ -157,25 +236,53 @@ ::copy_members(const sample_list& rhs) { template inline void sample_list -::load(const std::string& samplelist_file, +::load(std::istream& istrm, size_t stride, size_t offset) { - std::ifstream istr(samplelist_file); - get_samples_per_file(istr, samplelist_file, stride, offset); - istr.close(); + m_stride = stride; + get_samples_per_file(istrm, stride, offset); +} + +template +inline void sample_list +::load(const std::string& samplelist_file, + const lbann_comm& comm, + bool interleave) { + m_header.set_sample_list_name(samplelist_file); + std::ifstream istrm(samplelist_file); + load(istrm, comm, interleave); + istrm.close(); } template -inline sample_list_header sample_list -::load_header(const std::string& samplelist_file) const { - std::ifstream istr(samplelist_file); - return read_header(istr, samplelist_file); +inline void sample_list +::load(std::istream& istrm, + const lbann_comm& comm, + bool interleave) { + const size_t stride = interleave? comm.get_procs_per_trainer() : 1ul; + const size_t offset = interleave? comm.get_rank_in_trainer() : 0ul; + load(istrm, stride, offset); +} + +template +inline void sample_list +::load(const sample_list_header& header, + std::istream& istrm, + const lbann_comm& comm, + bool interleave) { + m_header = header; + const size_t stride = interleave? comm.get_procs_per_trainer() : 1ul; + const size_t offset = interleave? 
comm.get_rank_in_trainer() : 0ul; + + m_stride = stride; + read_sample_list(istrm, stride, offset); } template inline void sample_list ::load_from_string(const std::string& samplelist) { - std::istringstream istr(samplelist); - get_samples_per_file(istr, "", 1, 0); + m_header.set_sample_list_name(""); + std::istringstream istrm(samplelist); + load(istrm, 1ul, 0ul); } template @@ -199,11 +306,11 @@ ::empty() const { template inline std::string sample_list ::read_header_line(std::istream& istrm, - const std::string& filename, - const std::string& info) const { + const std::string& listname, + const std::string& info) { if (!istrm.good()) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info); + + " :: unable to read the header line of sample list " + listname + " for " + info); } std::string line; @@ -211,7 +318,7 @@ ::read_header_line(std::istream& istrm, if (line.empty()) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info + + " :: unable to read the header line of sample list " + listname + " for " + info + " -- the line was empty"); } return line; @@ -219,47 +326,27 @@ ::read_header_line(std::istream& istrm, template -inline sample_list_header sample_list -::read_header(std::istream& istrm, - const std::string& filename) const { - sample_list_header hdr; - - hdr.m_sample_list_filename = filename; - - std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); - std::stringstream header1(line1); - - std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); - std::stringstream header2(line2); - - std::string line3 = read_header_line(istrm, filename, "the data file directory"); - std::stringstream header3(line3); +inline void sample_list +::read_header(std::istream& istrm) { + const std::string listname = m_header.get_sample_list_name(); - std::string sample_list_type; - header1 >> sample_list_type; - std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); + std::string line1 = read_header_line(istrm, listname, "the exclusiveness\n"); + std::string line2 = read_header_line(istrm, listname, "the number of samples and the number of files\n"); + std::string line3 = read_header_line(istrm, listname, "the data file directory\n"); - const std::string type_exclusive = sample_exclusion_list; - size_t found = sample_list_type.find(type_exclusive); + m_header.set_sample_list_type(line1); + m_header.set_sample_count(line2); + m_header.set_data_file_dir(line3); - if (found != std::string::npos) { - hdr.m_is_exclusive = true; - } else { - hdr.m_is_exclusive = false; + if (m_header.use_label_header()) { + std::string line4 = read_header_line(istrm, listname, "the path to label/response file\n"); + m_header.set_label_filename(line4); } - header2 >> hdr.m_included_sample_count; - header2 >> hdr.m_excluded_sample_count; - header2 >> hdr.m_num_files; - - header3 >> hdr.m_file_dir; - - if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { - LBANN_ERROR(std::string{} + "file " + filename - + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); + if (m_header.get_file_dir().empty() || !check_if_dir_exists(m_header.get_file_dir())) { + LBANN_ERROR(std::string{} + "file " + listname + + " :: data root directory '" + m_header.get_file_dir() 
+ "' does not exist."); } - - return hdr; } @@ -293,9 +380,9 @@ ::read_sample_list(std::istream& istrm, const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; - if (filename.empty() || !check_if_file_exists(file_path)) { + if (filename.empty() || (m_check_data_file && !check_if_file_exists(file_path))) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: data file '" + filename + "' does not exist."); + + " :: data file '" + file_path + "' does not exist."); } const sample_file_id_t index = m_file_id_stats_map.size(); @@ -323,10 +410,10 @@ ::read_sample_list(std::istream& istrm, template inline size_t sample_list ::get_samples_per_file(std::istream& istrm, - const std::string& filename, size_t stride, size_t offset) { - m_header = read_header(istrm, filename); + read_header(istrm); + m_stride = stride; read_sample_list(istrm, stride, offset); return size(); @@ -412,7 +499,7 @@ ::all_gather_archive_new(const std::string &archive, for (auto t : packed_sizes) { g += t; } - if (!me) { + if (me == comm.get_trainer_master()) { std::cout << "global archive size: " << g << std::endl; } @@ -420,7 +507,7 @@ ::all_gather_archive_new(const std::string &archive, gathered_archive[p].resize(packed_sizes[p]); if (me == p) { gathered_archive[p] = archive; - } + } int sz = packed_sizes[p]; char *data = const_cast(gathered_archive[p].data()); comm.trainer_broadcast(p, data, sz); @@ -509,23 +596,34 @@ template void sample_list ::serialize( Archive & ar ) { ar(m_header, m_sample_list, m_file_id_stats_map); + // The member variables that are only meaningful during initial loading + // are not included here. + // e.g., m_stride, m_keep_order, m_check_data_file } template inline void sample_list ::write_header(std::string& sstr, size_t num_files) const { - // The first line indicate if the list is exclusive or inclusive - // The next line contains the number of samples (included and excluded), - // as well as the number of files, which are the same in this caes - // The next line contains the root data file directory - - sstr += (m_header.is_exclusive()? sample_exclusion_list + "\n" : sample_inclusion_list + "\n"); - size_t total, included, excluded; - get_num_samples(total, included, excluded); - /// TODO: clarify the comment below - /// Include the number of invalid samples, which for an inclusive index list is always 0 - sstr += std::to_string(included) + ' ' + std::to_string(excluded) + ' ' + std::to_string(num_files) + '\n'; + // The first line indicate if the list is single-sample-per-file type, + // multi-sample-exclusive or multi-sample-inclusive. + // The second line contains the number of samples (included and excluded + // when applicable), as well as the number of files. + // The third line contains the root data file directory. + // The fourth line contains the path to the label file when applicable + + if (m_header.is_multi_sample()) { + sstr += (m_header.is_exclusive()? 
multi_sample_exclusion + "\n" : multi_sample_inclusion + "\n"); + + size_t total, included, excluded; + get_num_samples(total, included, excluded); + + sstr += std::to_string(included) + ' ' + std::to_string(excluded) + ' ' + std::to_string(num_files) + '\n'; + } else { + sstr += single_sample + "\n"; + sstr += std::to_string(num_files) + '\n'; + } sstr += m_header.get_file_dir() + '\n'; + sstr += m_header.get_label_filename() + '\n'; } template @@ -547,8 +645,21 @@ ::to_string(std::string& sstr) const { sstr.clear(); - // reserve the string to hold the entire sample lit - size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1 + total_len + 1000; + static const size_t max_type_len + = std::max(std::max(multi_sample_exclusion.size(), + multi_sample_inclusion.size()), + single_sample.size()); + + static const size_t max_num_len + = std::to_string(std::numeric_limits::max()).size(); + + // reserve the string to hold the entire sample list + size_t estimated_len = max_type_len + + max_num_len + 2 + + m_header.get_file_dir().size() + + m_header.get_label_filename().size() + + 4 // sizeof('\n') * 4 + + total_len + 1000; sstr.reserve(estimated_len); // write the list header @@ -614,11 +725,17 @@ ::get_samples_filename(sample_file_id_t id) const { } template -inline const std::string& sample_list +inline const std::string& sample_list ::get_samples_dirname() const { return m_header.get_file_dir(); } +template +inline const std::string& sample_list +::get_label_filename() const { + return m_header.get_label_filename(); +} + template inline void sample_list ::set_samples_filename(sample_file_id_t id, const std::string& filename) { @@ -637,7 +754,7 @@ ::assign_samples_name() { } } else if constexpr (std::is_same::value) { for (auto& s: m_sample_list) { - s.second = s.first; + s.second = get_samples_filename(s.first); } } else { LBANN_ERROR(std::string{} + " :: base class does not implement this method" @@ -674,7 +791,7 @@ ::assign_samples_name() { template<> inline void sample_list ::assign_samples_name() { for (auto& s: m_sample_list) { - s.second = s.first; + s.second = get_samples_filename(s.first); } } @@ -739,9 +856,92 @@ ::all_gather_packed_lists(lbann_comm& comm) { } } + if (m_keep_order) { + reorder(); + } + assign_samples_name(); return; } +template +inline void sample_list +::reorder() { + if (m_stride > 1ul) { // undo interleaving + const size_t sz = m_sample_list.size(); + const size_t s = sz/m_stride; + const size_t s_more = (sz + m_stride - 1ul)/m_stride; + const size_t n_more = sz - s * m_stride; + + samples_t tmp_sample_list; + tmp_sample_list.reserve(s_more * m_stride); + + for (size_t i = 0ul; i < s_more; ++i) { + for (size_t j = i, k = 0ul; j < sz; ++k) { + tmp_sample_list.push_back(m_sample_list[j]); + //if (tmp_sample_list.size() == sz) break; + j += ((k < n_more)? 
s_more : s); + } + } + tmp_sample_list.resize(sz); + std::swap(m_sample_list, tmp_sample_list); + m_stride = 1ul; + } +} + +template +inline void sample_list +::build_sample_map_from_name_to_index() { + m_map_name_to_idx.clear(); + for (size_t i = 0ul; i < m_sample_list.size(); ++i) { + m_map_name_to_idx.insert(std::make_pair(m_sample_list[i].second, i)); + } +} + +template +inline void sample_list +::clear_sample_map_from_name_to_index() { + m_map_name_to_idx.clear(); + m_map_name_to_idx.rehash(0); + sample_map_t tmp; + tmp.rehash(0); + tmp.swap(m_map_name_to_idx); +} + +template +inline typename sample_list::sample_idx_t sample_list +::get_sample_index(const sample_name_t& sn) { + typename sample_map_t::const_iterator it = m_map_name_to_idx.find(sn); + if (it == m_map_name_to_idx.cend()) { + return size(); + //LBANN_ERROR(" :: cannot find the sample name ", lbann::to_string(sn)); + } + return it->second; +} + +template +inline void sample_list +::keep_sample_order(bool keep) { + m_keep_order = keep; +} + +template +inline void sample_list +::set_sample_list_name(const std::string& n) { + m_header.set_sample_list_name(n); +} + +template +inline void sample_list +::set_data_file_check() { + m_check_data_file = true; +} + +template +inline void sample_list +::unset_data_file_check() { + m_check_data_file = false; +} + } // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_open_files.hpp b/include/lbann/data_readers/sample_list_open_files.hpp index 57bfb89980e..4eac42c5c47 100644 --- a/include/lbann/data_readers/sample_list_open_files.hpp +++ b/include/lbann/data_readers/sample_list_open_files.hpp @@ -16,13 +16,13 @@ class sample_list_open_files : public sample_list { using sample_file_id_t = std::size_t; /** To describe a sample as a pair of the file to which it belongs and its name Each file may contain multiple samples. 
*/ - using sample_t = std::pair; + using sample_t = typename sample_list::sample_t; /// Information for each file used by the sample list: includes the file name, file descriptor, and /// and a queue of each step and substep when data will be loaded from the file using file_id_stats_t = std::tuple>>; /// Type for the list of samples - using samples_t = std::template vector< sample_t >; + using samples_t = typename sample_list::samples_t; /// Mapping of the file index to the statistics for each file using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something /// Type for the map of file descriptors to usage step and substep @@ -55,12 +55,6 @@ class sample_list_open_files : public sample_list { /// Serialize this sample list into an std::string object bool to_string(std::string& sstr) const override; - /// Allow read-only access to the internal list data - const samples_t& get_list() const; - - /// Allow read-only access to the metadata of the idx-th sample in the list - const sample_t& operator[](size_t idx) const; - const std::string& get_samples_filename(sample_file_id_t id) const override; file_handle_t get_samples_file_handle(sample_file_id_t id) const; @@ -130,9 +124,6 @@ class sample_list_open_files : public sample_list { file_id_stats_v_t m_file_id_stats_map; private: - /// List of all samples with a file identifier and sample name for each sample - samples_t m_sample_list; - /// Track the number of samples per file std::unordered_map m_file_map; diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp index 565b016bd22..ceec5493ab6 100644 --- a/include/lbann/data_readers/sample_list_open_files_impl.hpp +++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp @@ -48,7 +48,6 @@ template inline void sample_list_open_files ::copy_members(const sample_list_open_files& rhs) { sample_list::copy_members(rhs); - m_sample_list = rhs.m_sample_list; m_file_map = rhs.m_file_map; m_max_open_files = rhs.m_max_open_files; @@ -71,7 +70,7 @@ ::copy_members(const sample_list_open_files& rhs) { template inline size_t sample_list_open_files ::size() const { - return m_sample_list.size(); + return this->m_sample_list.size(); } template @@ -111,7 +110,7 @@ ::read_exclusive_list(std::istream& istrm, const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; - if (filename.empty() || !check_if_file_exists(file_path)) { + if (filename.empty() || (this->m_check_data_file && !check_if_file_exists(file_path))) { LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); } @@ -159,7 +158,7 @@ ::read_exclusive_list(std::istream& istrm, if (found != excluded_sample_indices.cend()) { continue; } - m_sample_list.emplace_back(index, to_sample_name_t(s)); + this->m_sample_list.emplace_back(index, to_sample_name_t(s)); valid_sample_count++; } @@ -173,7 +172,7 @@ ::read_exclusive_list(std::istream& istrm, if (m_header.get_num_files() != cnt_files) { LBANN_ERROR(std::string("Sample list ") - + m_header.get_sample_list_filename() + + m_header.get_sample_list_name() + std::string(": number of files requested ") + std::to_string(m_header.get_num_files()) + std::string(" does not equal number of files loaded ") @@ -214,7 +213,7 @@ ::read_inclusive_list(std::istream& istrm, const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; - if (filename.empty() || !check_if_file_exists(file_path)) { + if (filename.empty() || 
(this->m_check_data_file && !check_if_file_exists(file_path))) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: data file '" + filename + "' does not exist."); } @@ -236,7 +235,7 @@ ::read_inclusive_list(std::istream& istrm, while(!sstr.eof()) { std::string sample_name_str; sstr >> sample_name_str; - m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); + this->m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); #ifdef VALIDATE_SAMPLE_LIST sample_names.emplace_back(sample_name_str); #endif @@ -296,7 +295,7 @@ ::save( Archive & ar ) const { for(auto&& e : m_file_id_stats_map) { file_stats.emplace_back(std::make_tuple(std::get<0>(e), std::get<2>(e))); } - ar(m_header, m_sample_list, file_stats); + ar(m_header, this->m_sample_list, file_stats); } template @@ -305,7 +304,7 @@ void sample_list_open_files ::load( Archive & ar ) { using ar_file_stats_t = std::tuple>>; std::vector file_stats; - ar(m_header, m_sample_list, file_stats); + ar(m_header, this->m_sample_list, file_stats); m_file_id_stats_map.reserve(file_stats.size()); for(auto&& e : file_stats) { //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle(), std::deque>{})); @@ -318,15 +317,28 @@ template inline bool sample_list_open_files ::to_string(std::string& sstr) const { std::map> tmp_file_map; - for (const auto& s : m_sample_list) { + for (const auto& s : this->m_sample_list) { const std::string& filename = get_samples_filename(s.first); tmp_file_map[filename].emplace_back(s.second); } sstr.clear(); - // reserve the string to hold the entire sample lit - size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1; + static const size_t max_type_len + = std::max(std::max(multi_sample_exclusion.size(), + multi_sample_inclusion.size()), + single_sample.size()); + + static const size_t max_num_len + = std::to_string(std::numeric_limits::max()).size(); + + // reserve the string to hold the entire sample list + size_t estimated_len = max_type_len + + max_num_len * 3 + 2 + + m_header.get_file_dir().size() + + m_header.get_label_filename().size() + + 4; + for (const auto& f : tmp_file_map) { estimated_len += f.first.size() + std::to_string(f.second.size()).size() @@ -370,18 +382,6 @@ ::get_num_samples(size_t& total, size_t& included, size_t& excluded) const { excluded = total - included; } -template -inline const typename sample_list_open_files::samples_t& -sample_list_open_files::get_list() const { - return m_sample_list; -} - -template -inline const typename sample_list_open_files::sample_t& -sample_list_open_files::operator[](size_t idx) const { - return m_sample_list[idx]; -} - template inline const std::string& sample_list_open_files ::get_samples_filename(sample_file_id_t id) const { @@ -522,14 +522,14 @@ ::all_gather_packed_lists(lbann_comm& comm) { } m_open_fd_pq.clear(); - size_t num_samples = this->all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_samples = this->all_gather_field(this->m_sample_list, per_rank_samples, comm); size_t num_ids = this->all_gather_field(my_files, per_rank_files, comm); size_t num_files = this->all_gather_field(m_file_map, per_rank_file_map, comm); - m_sample_list.clear(); + this->m_sample_list.clear(); m_file_id_stats_map.clear(); - m_sample_list.reserve(num_samples); + this->m_sample_list.reserve(num_samples); m_file_id_stats_map.reserve(num_ids); m_file_map.reserve(num_files); @@ -557,10 +557,16 @@ ::all_gather_packed_lists(lbann_comm& comm) { } index = 
search_result->second;
       }
-      m_sample_list.emplace_back(std::make_pair(index, s.second));
+      this->m_sample_list.emplace_back(std::make_pair(index, s.second));
     }
   }
 
+  if (this->m_keep_order) {
+    this->reorder();
+  }
+
+  // For the multi-sample-per-file case, sample names are read from the
+  // sample list file.
   return;
 }
@@ -579,7 +585,7 @@ ::compute_epochs_file_usage(const std::vector& shuffled_indices,
   m_open_fd_pq.clear();
   for (size_t i = 0; i < shuffled_indices.size(); i++) {
     int idx = shuffled_indices[i];
-    const auto& s = m_sample_list[idx];
+    const auto& s = this->m_sample_list[idx];
     sample_file_id_t index = s.first;
 
     if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast(comm.get_rank_in_trainer())) {
@@ -648,7 +654,7 @@ ::manage_open_file_handles(sample_file_id_t id, bool pre_open_fd) {
 template
 inline file_handle_t sample_list_open_files
 ::open_samples_file_handle(const size_t i, bool pre_open_fd) {
-  const sample_t& s = m_sample_list[i];
+  const sample_t& s = this->m_sample_list[i];
   sample_file_id_t id = s.first;
   file_handle_t h = get_samples_file_handle(id);
   if (!is_file_handle_valid(h)) {
@@ -675,7 +681,7 @@ ::open_samples_file_handle(const size_t i, bool pre_open_fd) {
 template
 inline void sample_list_open_files
 ::close_if_done_samples_file_handle(const size_t i) {
-  const sample_t& s = m_sample_list[i];
+  const sample_t& s = this->m_sample_list[i];
   sample_file_id_t id = s.first;
   auto h = get_samples_file_handle(id);
   if (!is_file_handle_valid(h)) {
diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp
index 8bb4d50fc25..9af9c0df67f 100644
--- a/include/lbann/proto/proto_common.hpp
+++ b/include/lbann/proto/proto_common.hpp
@@ -46,19 +46,19 @@ class Trainer;
 namespace lbann {
 
-/** @brief Customize the name of the index list
+/** @brief Customize the name of the sample list
 *
 * The following options are available
 *  - trainer ID
 *  - model name
 *
 * The format for the naming convention if the provided name is
- * \<index list\> is:
+ * \<sample list\> is:
 * @verbatim
- <index list> == <basename>.<extension>
+ <sample list> == <basename>.<extension> <model name>_t<trainer ID>_<basename>.<extension>
 @endverbatim
 */
-void customize_data_readers_index_list(const lbann_comm& comm,
+void customize_data_readers_sample_list(const lbann_comm& comm,
                                        ::lbann_data::LbannPB& p);
 
/** @brief instantiates one or more generic_data_readers and inserts
diff --git a/include/lbann/utils/file_utils.hpp b/include/lbann/utils/file_utils.hpp
index 53f9c9b6be0..3b544b48227 100644
--- a/include/lbann/utils/file_utils.hpp
+++ b/include/lbann/utils/file_utils.hpp
@@ -70,7 +70,7 @@ bool check_if_dir_exists(const std::string& dirname);
 /** @todo Deprecated. Use @c lbann::file::make_directory instead.
 */
 bool create_dir(const std::string output_dir);
 
-bool load_file(const std::string filename, std::vector<char>& buf);
+bool load_file(const std::string filename, std::vector<char>& buf, bool append = false);
 
 inline void __swapEndianInt(unsigned int& ui) {
   ui = ((ui >> 24) | ((ui<<8) & 0x00FF0000) | ((ui>>8) & 0x0000FF00) | (ui << 24));
diff --git a/model_zoo/data_readers/data_reader_jag.prototext b/model_zoo/data_readers/data_reader_jag.prototext
index 6c5dc722528..e6218fa3686 100644
--- a/model_zoo/data_readers/data_reader_jag.prototext
+++ b/model_zoo/data_readers/data_reader_jag.prototext
@@ -14,10 +14,9 @@ data_reader {
     name: "jag_conduit"
     role: "train"
     shuffle: true
-    data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/"
-    index_list: "100Kindex.txt"
-    index_list_per_trainer: false
-    index_list_per_model: false
+    sample_list: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/100Kindex.txt"
+    sample_list_per_trainer: false
+    sample_list_per_model: false
 
     validation_percent: 0
     absolute_sample_count: 0
@@ -34,10 +33,9 @@ data_reader {
     role: "test"
     shuffle: false
     # change to a lustre path
-    data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/"
-    index_list: "t1_sample_list.txt"
-    index_list_per_trainer: false
-    index_list_per_model: false
+    sample_list: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/t1_sample_list.txt"
+    sample_list_per_trainer: false
+    sample_list_per_model: false
 
     validation_percent: 0
     absolute_sample_count: 0
diff --git a/python/lbann/contrib/lc/paths.py b/python/lbann/contrib/lc/paths.py
index fecbf1dec6a..939eb39cc9e 100644
--- a/python/lbann/contrib/lc/paths.py
+++ b/python/lbann/contrib/lc/paths.py
@@ -106,3 +106,50 @@ def imagenet_labels(system = system(), data_set = 'train',
             return os.path.join(label_dir, 'test.txt')
     else:
         raise RuntimeError('unknown ImageNet data set (' + data_set + ')')
+
+def imagenet_sample_list(system = system(), data_set = 'train',
+                         num_classes = 1000):
+    """ImageNet sample_list file on LC system.
+
+    The file contains ground truth labels from the ILSVRC2012
+    competition. It is a plain text file where each line contains an
+    image file path (relative to the ImageNet directory; see the
+    `imagenet_dir` function) and the corresponding label ID.
+
+    There are three available data sets: 'training', 'validation', and
+    'testing'.
+
+    Some of these data sets have been preprocessed to only include
+    images in a subset of the label classes, e.g. images in the first
+    10 label classes. This is convenient for quickly evaluating
+    performance or learning behavior. The availability of these
+    subsampled data sets may vary by system.
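+
+    As a sketch (the exact directory depends on the system), a call
+    such as imagenet_sample_list(data_set='train', num_classes=10)
+    resolves to a path ending in sample_list/train_c0-9_sample_list.txt.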
+ + """ + slist_dir = parallel_file_system_path(system) + if system in ('lassen', 'sierra'): + slist_dir += 'brainusr/datasets/ILSVRC2012/sample_list/' + else: + slist_dir += 'brainusr/datasets/ILSVRC2012/sample_list/' + suffixes = {1000: '', 10: '_c0-9', 100: '_c0-99', + 200: '_c100-299', 300: '_c0-299'} + if data_set.lower() in ('train', 'training'): + if num_classes in suffixes.keys(): + return os.path.join(slist_dir, + 'train' + suffixes[num_classes] + '_sample_list.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('val', 'validation'): + if num_classes in suffixes.keys(): + return os.path.join(slist_dir, + 'val' + suffixes[num_classes] + '_sample_list.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('test', 'testing'): + return os.path.join(slist_dir, 'test_sample_list.txt') + else: + raise RuntimeError('unknown ImageNet data set (' + data_set + ')') diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index a3669ff8572..4530044373f 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -623,17 +623,20 @@ std::string generic_data_reader::get_local_file_dir() const { return m_local_file_dir; } -void generic_data_reader::set_data_index_list(std::string s) { - m_data_index_list = s; +void generic_data_reader::set_data_sample_list(std::string s) { + m_data_sample_list = s; } -std::string generic_data_reader::get_data_index_list() const { - if (m_data_index_list == "") { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: you apparently did not call set_data_index_list; error!"); - } - return m_data_index_list; +std::string generic_data_reader::get_data_sample_list() const { + return m_data_sample_list; +} + +void generic_data_reader::keep_sample_order(bool same_order) { + // The sample_list::keep_sample_order() should be called using this + // flag. By doing so, it will add additional step to re-shuffle the + // sample order to restore it to the original before the loading + // with interleaving accesses by multiple ranks in a trainer. 
+ m_keep_sample_order = same_order; } void generic_data_reader::set_data_filename(std::string s) { @@ -866,7 +869,7 @@ void generic_data_reader::print_get_methods(const std::string filename) { out << "get_file_dir " << get_file_dir() << std::endl; out << "get_local_file_dir " << get_local_file_dir() << std::endl; - out << "get_data_index_list " << get_data_index_list() << std::endl; + out << "get_data_sample_list " << get_data_sample_list() << std::endl; out << "get_data_filename " << get_data_filename() << std::endl; out << "get_label_filename " << get_label_filename() << std::endl; out << "get_role " << get_role() << std::endl; diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 593a7413823..a11d2d078c2 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -49,19 +49,26 @@ image_data_reader::image_data_reader(const image_data_reader& rhs) } image_data_reader& image_data_reader::operator=(const image_data_reader& rhs) { + if (this == &rhs) { + return (*this); + } generic_data_reader::operator=(rhs); m_image_dir = rhs.m_image_dir; - m_image_list = rhs.m_image_list; + m_labels = rhs.m_labels; m_image_width = rhs.m_image_width; m_image_height = rhs.m_image_height; m_image_num_channels = rhs.m_image_num_channels; m_image_linearized_size = rhs.m_image_linearized_size; m_num_labels = rhs.m_num_labels; + m_sample_list.copy(rhs.m_sample_list); return (*this); } void image_data_reader::copy_members(const image_data_reader &rhs) { + if (this == &rhs) { + return; + } if(rhs.m_data_store != nullptr) { m_data_store = new data_store_conduit(rhs.get_data_store()); @@ -69,12 +76,13 @@ void image_data_reader::copy_members(const image_data_reader &rhs) { } m_image_dir = rhs.m_image_dir; - m_image_list = rhs.m_image_list; + m_labels = rhs.m_labels; m_image_width = rhs.m_image_width; m_image_height = rhs.m_image_height; m_image_num_channels = rhs.m_image_num_channels; m_image_linearized_size = rhs.m_image_linearized_size; m_num_labels = rhs.m_num_labels; + m_sample_list.copy(rhs.m_sample_list); //m_thread_cv_buffer = rhs.m_thread_cv_buffer } @@ -118,7 +126,10 @@ void image_data_reader::set_input_params(const int width, const int height, cons } bool image_data_reader::fetch_label(CPUMat& Y, int data_id, int mb_idx) { - const label_t label = m_image_list[data_id].second; + if (static_cast(data_id) >= m_labels.size()) { + LBANN_ERROR("Cannot find label for sample " + std::to_string(data_id) + "."); + } + const label_t label = m_labels[data_id]; if (label < label_t{0} || label >= static_cast(m_num_labels)) { LBANN_ERROR( "\"",this->get_type(),"\" data reader ", @@ -129,32 +140,55 @@ bool image_data_reader::fetch_label(CPUMat& Y, int data_id, int mb_idx) { return true; } +void image_data_reader::dump_sample_label_list(const std::string& dump_file_name) { + std::ofstream os(dump_file_name); + const auto num_samples = m_sample_list.size(); + for (size_t i = 0ul; i < num_samples; ++i) { + const auto file_id = m_sample_list[i].first; + const std::string filename = m_sample_list.get_samples_filename(file_id); + os << filename << ' ' << std::to_string(m_labels[i]) << std::endl; + } +} + void image_data_reader::load() { options *opts = options::get(); - const std::string imageListFile = get_data_filename(); + // Load sample list + const std::string sample_list_file = get_data_sample_list(); - // load image list - m_image_list.clear(); - FILE *fplist = fopen(imageListFile.c_str(), "rt"); - if (!fplist) { - LBANN_ERROR("failed to 
open: " + imageListFile + " for reading"); - } - while (!feof(fplist)) { - char imagepath[512]; - label_t imagelabel; - if (fscanf(fplist, "%s%d", imagepath, &imagelabel) <= 1) { - break; + if (sample_list_file.empty()) { + gen_list_of_samples(); + } else { + load_list_of_samples(sample_list_file); + } + + if (opts->has_string("write_sample_list") && m_comm->am_trainer_master()) { + const std::string slist_name = (m_sample_list.get_header()).get_sample_list_name(); + std::stringstream s; + std::string basename = get_basename_without_ext(slist_name); + std::string ext = get_ext_name(slist_name); + s << basename << "." << ext; + { + const std::string msg = " writing sample list '" + slist_name + + "' as '" + s.str() + "'"; + LBANN_WARNING(msg); + } + m_sample_list.write(s.str()); + } + if (opts->has_string("write_sample_label_list") && m_comm->am_trainer_master()) { + if (!(m_keep_sample_order || opts->has_string("keep_sample_order"))) { + std::cout << "Writting sample label list without the option " + << "`keep_sample_order' set." << std::endl; } - m_image_list.emplace_back(imagepath, imagelabel); + std::string dump_file = "image_list.trainer" + + std::to_string(m_comm->get_trainer_rank()) + + "." + this->get_role() + ".txt"; + dump_sample_label_list(dump_file); } - fclose(fplist); - // TODO: this will probably need to change after sample_list class - // is modified // reset indices m_shuffled_indices.clear(); - m_shuffled_indices.resize(m_image_list.size()); + m_shuffled_indices.resize(m_sample_list.size()); std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); resize_shuffled_indices(); @@ -164,6 +198,15 @@ void image_data_reader::load() { select_subset_of_data(); } +image_data_reader::sample_t image_data_reader::get_sample(const size_t idx) const { + if (idx >= m_labels.size()) { + LBANN_ERROR("Cannot find label for sample " + std::to_string(idx) + "."); + } + const auto sample_name = m_sample_list[idx].second; + const auto label = m_labels[idx]; + return sample_t(sample_name, label); +} + void read_raw_data(const std::string &filename, std::vector &data) { data.clear(); std::ifstream in(filename.c_str()); @@ -215,7 +258,6 @@ void image_data_reader::do_preload_data_store() { load_conduit_nodes_from_file(data_ids[io_thread_pool->get_local_thread_id()]); io_thread_pool->finish_work_group(); } - else { if (is_master()) { std::cout << "mode: NOT data_store_thread\n"; @@ -240,12 +282,6 @@ void image_data_reader::setup(int num_io_threads, observer_ptr io_t static_cast(m_image_width)}); } -std::vector image_data_reader::get_image_list_of_current_mb() const { - std::vector ret; - ret.reserve(m_mini_batch_size); - return ret; -} - bool image_data_reader::load_conduit_nodes_from_file(const std::unordered_set &data_ids) { for (auto data_id : data_ids) { conduit::Node &node = m_data_store->get_empty_node(data_id); @@ -257,8 +293,15 @@ bool image_data_reader::load_conduit_nodes_from_file(const std::unordered_set(data_id) >= m_labels.size()) { + LBANN_ERROR("Cannot find label for sample " + std::to_string(data_id) + "."); + } + const label_t label = m_labels[data_id]; + std::vector data; read_raw_data(filename, data); node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label); @@ -266,5 +309,291 @@ void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node & node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size(); } +/// Allow streams to be constructed on an existing data buffer without copying +template > +class vectorwrapbuf : public std::basic_streambuf { 
+public:
+  vectorwrapbuf(std::vector<CharT> &vec) {
+    this->setg(vec.data(), vec.data(), vec.data() + vec.size());
+  }
+};
+
+/**
+ * Load a sample list and then load labels from a separate file using `load_labels()`.
+ * With the command line option `--load_full_sample_list_once`, the trainer master
+ * first loads the entire sample list file into a memory buffer, and broadcasts it
+ * to the other workers within the trainer. Then, the sample list is populated
+ * using the buffer content. Otherwise, the sample list is directly read from the
+ * file. When specified, the prototext variable `data_filedir` overrides the base
+ * location of the data files written in the header of the sample list file.
+ * The option `keep_sample_order`, from the command line or the data reader
+ * prototext, makes sure the order of samples in the list remains the same even
+ * when the list is loaded in an interleaved order by multiple trainer workers.
+ */
+void image_data_reader::load_list_of_samples(const std::string sample_list_file) {
+  // load the sample list
+  double tm1 = get_time();
+
+  options *opts = options::get();
+
+  if (m_keep_sample_order || opts->has_string("keep_sample_order")) {
+    m_sample_list.keep_sample_order(true);
+  } else {
+    m_sample_list.keep_sample_order(false);
+  }
+
+  if (opts->get_bool("check_data")) {
+    m_sample_list.set_data_file_check();
+  }
+
+  std::vector<char> buffer;
+
+  if (opts->has_string("load_full_sample_list_once")) {
+    if (m_comm->am_trainer_master()) {
+      load_file(sample_list_file, buffer);
+    }
+    m_comm->trainer_broadcast(m_comm->get_trainer_master(), buffer);
+
+    vectorwrapbuf<char> strmbuf(buffer);
+    std::istream iss(&strmbuf);
+
+    m_sample_list.set_sample_list_name(sample_list_file);
+    m_sample_list.load(iss, *m_comm, true);
+  } else {
+    m_sample_list.load(sample_list_file, *m_comm, true);
+  }
+
+  double tm2 = get_time();
+
+  if (is_master()) {
+    std::cout << "Time to load sample list '" << sample_list_file << "': "
+              << tm2 - tm1 << std::endl;
+  }
+
+  /// Merge all the sample list pieces from the workers within the trainer
+  m_sample_list.all_gather_packed_lists(*m_comm);
+  set_file_dir(m_sample_list.get_samples_dirname());
+
+  double tm3 = get_time();
+  if(is_master()) {
+    std::cout << "Time to gather sample list '" << sample_list_file << "': "
+              << tm3 - tm2 << std::endl;
+  }
+  buffer.clear();
+  buffer.shrink_to_fit();
+
+  std::vector<char> empty_buffer;
+  load_labels(empty_buffer);
+}
+
+void image_data_reader::load_list_of_samples_from_archive(const std::string& sample_list_archive) {
+  // load the sample list
+  double tm1 = get_time();
+  std::stringstream ss(sample_list_archive); // any stream can be used
+
+  cereal::BinaryInputArchive iarchive(ss); // Create an input archive
+
+  iarchive(m_sample_list); // Read the data from the archive
+  double tm2 = get_time();
+
+  if (is_master()) {
+    std::cout << "Time to load sample list from archive: " << tm2 - tm1 << std::endl;
+  }
+}
+
+/**
+ * Similar to `load_list_of_samples()`, but generates the sample list header
+ * on-the-fly and reuses the original imagenet data list file for loading both
+ * the sample list and the label list; the path of that file is specified via
+ * the prototext variable `data_filename`. This is for backward compatibility,
+ * and allows users to keep using an old data reader prototext without
+ * preparing a sample list. The base location of data files is still
+ * specified via the `data_filedir` prototext variable as before.
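+ *
+ * As a rough sketch (file contents hypothetical), a legacy list such as
+ *
+ *   n01440764/image_0.JPEG 0
+ *   n01440764/image_1.JPEG 0
+ *
+ * is treated as a single-sample-per-file list: the generated header uses
+ * lbann::single_sample as the type, the number of lines as the sample count,
+ * `data_filedir` as the data directory, and the same file as the label list
+ * later consumed by load_labels().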
+ */
+void image_data_reader::gen_list_of_samples() {
+  // load the sample list
+  double tm1 = get_time();
+
+  // The original imagenet data file specified via the prototext variable
+  // `data_filename`
+  const std::string imageListFile = get_data_filename();
+
+  sample_list_header header; // A sample list header being generated
+  header.set_sample_list_type(lbann::single_sample);
+  header.set_data_file_dir(get_file_dir());
+  header.set_label_filename(imageListFile);
+  const std::string sample_list_file = imageListFile;
+  header.set_sample_list_name(sample_list_file);
+
+  options *opts = options::get();
+
+  if (m_keep_sample_order || opts->has_string("keep_sample_order")) {
+    m_sample_list.keep_sample_order(true);
+  } else {
+    m_sample_list.keep_sample_order(false);
+  }
+
+  if (opts->get_bool("check_data")) {
+    m_sample_list.set_data_file_check();
+  }
+
+  std::vector<char> buffer;
+
+  if (opts->has_string("load_full_sample_list_once")) {
+    // The trainer master loads the entire file into a buffer in the memory
+    if (m_comm->am_trainer_master()) {
+      load_file(imageListFile, buffer);
+    }
+    // Broadcast the buffer to workers within this trainer
+    m_comm->trainer_broadcast(m_comm->get_trainer_master(), buffer);
+
+    // The trainer master counts the number of samples (lines) and broadcasts
+    // the result
+    size_t num_samples = 0ul;
+    if (m_comm->am_trainer_master()) {
+      vectorwrapbuf<char> strmbuf(buffer);
+      std::istream iss(&strmbuf);
+      num_samples = determine_num_of_samples(iss);
+    }
+    m_comm->trainer_broadcast(m_comm->get_trainer_master(), num_samples);
+    header.set_sample_count(std::to_string(num_samples));
+
+    // Populate the sample list using the generated header and the preloaded buffer
+    vectorwrapbuf<char> strmbuf(buffer);
+    std::istream iss(&strmbuf);
+    m_sample_list.load(header, iss, *m_comm, true);
+  } else {
+    // The trainer master counts the number of samples (lines) and broadcasts
+    // the result
+    size_t num_samples = 0ul;
+    if (m_comm->am_trainer_master()) {
+      std::ifstream iss(imageListFile);
+      num_samples = determine_num_of_samples(iss);
+    }
+    m_comm->trainer_broadcast(m_comm->get_trainer_master(), num_samples);
+    header.set_sample_count(std::to_string(num_samples));
+
+    // Populate the sample list using the generated header and the original
+    // imagenet data list file
+    std::ifstream iss(imageListFile);
+    m_sample_list.load(header, iss, *m_comm, true);
+  }
+
+  double tm2 = get_time();
+
+  if (is_master()) {
+    std::cout << "Time to load sample list '" << sample_list_file << "': "
+              << tm2 - tm1 << std::endl;
+  }
+
+  /// Merge all the sample list pieces from the workers within the trainer
+  m_sample_list.all_gather_packed_lists(*m_comm);
+
+  double tm3 = get_time();
+  if(is_master()) {
+    std::cout << "Time to gather sample list '" << sample_list_file << "': "
+              << tm3 - tm2 << std::endl;
+  }
+  // Reuse the preloaded buffer for obtaining labels when possible
+  load_labels(buffer);
+}
+
+/// Populate the sample label vector out of the given input stream
+void image_data_reader::read_labels(std::istream& istrm) {
+  const std::string whitespaces(" \t\f\v\n\r");
+  const size_t num_samples = m_sample_list.size();
+
+  // To help populate the label list, build a map from a sample name to
+  // the index of the corresponding item in the sample list
+  m_sample_list.build_sample_map_from_name_to_index();
+
+  options *opts = options::get();
+  const bool check_data = opts->get_bool("check_data");
+
+  m_labels.clear();
+  m_labels.resize(num_samples);
+  std::unordered_set idx_set;
+
+  std::string line;
+
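+  // Each non-empty line is expected to hold "<sample name> <label>",
+  // e.g. "n01440764/image_0.JPEG 0" (hypothetical); names that do not
+  // appear in this rank's sample list are skipped below.
+ 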
while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + + // clear trailing spaces for accurate parsing + std::stringstream sstr(line.substr(0, end_of_str + 1)); + std::string sname; + label_t label; + + sstr >> sname >> label; + + // Translate the sample name into the index into the sample list + const auto sample_idx = m_sample_list.get_sample_index(sample_name_t(sname)); + if (sample_idx >= num_samples) { + continue; + } + if (check_data) { + idx_set.insert(sample_idx); + } + m_labels[sample_idx] = label; + } + + // Free the memory of the temporary map + m_sample_list.clear_sample_map_from_name_to_index(); + + if (check_data && (num_samples != idx_set.size())) { + LBANN_ERROR("The number of samples is different from the number of labels: ", + std::to_string(num_samples), + " != ", + std::to_string(idx_set.size())); + } +} + +/** + * Load the sample labels either from a file or from a preloaded buffer. + * If the buffer given is empty, the label file specified in the sample list + * header is used. + */ +void image_data_reader::load_labels(std::vector& preloaded_buffer) { + const std::string imageListFile = m_sample_list.get_label_filename(); + + double tm1 = get_time(); + + if (preloaded_buffer.empty()) { // read labels from a file + std::string line; + std::ifstream is; + is.open(imageListFile); + if (is.fail()) { + LBANN_ERROR("failed to open: " + imageListFile + " for reading"); + } + read_labels(is); + } else { // read labels from a preloaded buffer + vectorwrapbuf strmbuf(preloaded_buffer); + std::istream is(&strmbuf); + read_labels(is); + } + + if (is_master()) { + std::cout << "Time to load label file '" << imageListFile << "': " + << get_time() - tm1 << std::endl; + } +} + +size_t image_data_reader::determine_num_of_samples(std::istream& istrm) const { + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt = 0ul; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + cnt ++; + } + return cnt; +} } // namespace lbann diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index 0d83fc679ad..2089e4f9a22 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -54,7 +54,10 @@ CPUMat imagenet_reader::create_datum_view(CPUMat& X, const int mb_idx) const { bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { El::Matrix image; std::vector dims; - const std::string image_path = get_file_dir() + m_image_list[data_id].first; + const auto file_id = m_sample_list[data_id].first; + const std::string filename = m_sample_list.get_samples_filename(file_id); + const std::string image_path = get_file_dir() + filename; + if (m_data_store != nullptr) { bool have_node = true; conduit::Node node; diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 76eb78685c8..6266e537b0e 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -54,6 +54,7 @@ #include #include +#include // This comes after all the headers, and is only visible within the current implementation file. 
// To make sure, we put '#undef _CN_' at the end of this file @@ -786,58 +787,15 @@ void data_reader_jag_conduit::load() { if(is_master()) { std::cout << "data_reader_jag_conduit - starting load" << std::endl; } - const std::string data_dir = add_delimiter(get_file_dir()); - const std::string sample_list_file = data_dir + get_data_index_list(); + const std::string sample_list_file = get_data_sample_list(); - options *opts = options::get(); - bool check_data = opts->get_bool("check_data"); - - /// The use of these flags need to be updated to properly separate - /// how index lists are used between trainers and models - /// @todo m_list_per_trainer || m_list_per_model - double tm2 = get_time(); - load_list_of_samples(sample_list_file, m_comm->get_procs_per_trainer(), m_comm->get_rank_in_trainer()); - if(is_master()) { - std::cout << "Finished loading sample list; time: " << get_time() - tm2 << std::endl; - if (!check_data) { - std::cout << "Skipping check data" << std::endl; - } + if (sample_list_file.empty()) { + LBANN_ERROR("sample list is not specified."); } - /// Check the data that each rank loaded - if (!m_is_data_loaded && !m_sample_list.empty()) { - m_is_data_loaded = true; - - /// Open the first sample to make sure that all of the fields are correct - m_sample_list.open_samples_file_handle(0, true); + load_list_of_samples(sample_list_file); - if (m_scalar_keys.size() == 0u) { - set_all_scalar_choices(); // use all by default if none is specified - } - if (check_data) { - check_scalar_keys(); - } - - if (m_input_keys.size() == 0u) { - set_all_input_choices(); // use all by default if none is specified - } - if (check_data) { - check_input_keys(); - } - - if (check_data) { - check_image_data(); - } - - m_sample_list.close_if_done_samples_file_handle(0); - } - if(is_master()) { - std::cout << "Done with data checking" << std::endl; - } - - /// Merge all of the sample lists - tm2 = get_time(); - m_sample_list.all_gather_packed_lists(*m_comm); + options *opts = options::get(); if (opts->has_string("write_sample_list") && m_comm->am_trainer_master()) { { const std::string msg = " writing sample list " + sample_list_file; @@ -849,18 +807,12 @@ void data_reader_jag_conduit::load() { s << basename << "." 
<< ext;
     m_sample_list.write(s.str());
   }
-  if (is_master()) {
-    std::cout << "time for all_gather_packed_lists: " << get_time() - tm2 << std::endl;
-  }
+
   m_shuffled_indices.clear();
   m_shuffled_indices.resize(m_sample_list.size());
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
   resize_shuffled_indices();
 
-  if(is_master()) {
-    std::cout << "Lists have been gathered" << std::endl;
-  }
-
   instantiate_data_store();
   select_subset_of_data();
 }
@@ -926,14 +878,103 @@ void data_reader_jag_conduit::do_preload_data_store() {
   }
 }
 
-void data_reader_jag_conduit::load_list_of_samples(const std::string sample_list_file, size_t stride, size_t offset) {
+void data_reader_jag_conduit::sample_schema_check(const bool check_data) {
+  /// Check the data that each rank loaded
+  if (!m_is_data_loaded && !m_sample_list.empty()) {
+    m_is_data_loaded = true;
+
+    /// Open the first sample to make sure that all of the fields are correct
+    m_sample_list.open_samples_file_handle(0, true);
+
+    if (m_scalar_keys.size() == 0u) {
+      set_all_scalar_choices(); // use all by default if none is specified
+    }
+    if (check_data) {
+      check_scalar_keys();
+    }
+
+    if (m_input_keys.size() == 0u) {
+      set_all_input_choices(); // use all by default if none is specified
+    }
+    if (check_data) {
+      check_input_keys();
+    }
+
+    if (check_data) {
+      check_image_data();
+    }
+
+    m_sample_list.close_if_done_samples_file_handle(0);
+  }
+}
+
+template <typename CharT, typename TraitsT = std::char_traits<CharT>>
+class vectorwrapbuf : public std::basic_streambuf<CharT, TraitsT> {
+public:
+  vectorwrapbuf(std::vector<CharT> &vec) {
+    this->setg(vec.data(), vec.data(), vec.data() + vec.size());
+  }
+};
+
+void data_reader_jag_conduit::load_list_of_samples(const std::string sample_list_file) {
   // load the sample list
   double tm1 = get_time();
-  m_sample_list.load(sample_list_file, stride, offset);
+
+  options *opts = options::get();
+
+  if (this->m_keep_sample_order || opts->has_string("keep_sample_order")) {
+    m_sample_list.keep_sample_order(true);
+  } else {
+    m_sample_list.keep_sample_order(false);
+  }
+
+  const bool check_data = opts->get_bool("check_data");
+
+  if (check_data) {
+    m_sample_list.set_data_file_check();
+  }
+
+  std::vector<char> buffer;
+
+  if (opts->has_string("load_full_sample_list_once")) {
+    if (m_comm->am_trainer_master()) {
+      load_file(sample_list_file, buffer);
+    }
+    m_comm->trainer_broadcast(m_comm->get_trainer_master(), buffer);
+
+    vectorwrapbuf<char> strmbuf(buffer);
+    std::istream iss(&strmbuf);
+
+    m_sample_list.set_sample_list_name(sample_list_file);
+    m_sample_list.load(iss, *(this->m_comm), true);
+  } else {
+    m_sample_list.load(sample_list_file, *(this->m_comm), true);
+  }
+
+  double tm2 = get_time();
+  if (is_master()) {
+    std::cout << "Time to load sample list '" << sample_list_file << "': " << tm2 - tm1 << std::endl;
+  }
+
+  sample_schema_check(check_data);
+
+  double tm3 = get_time();
+  if (is_master()) {
+    if (!check_data) {
+      std::cout << "Skip data checking" << std::endl;
+    } else {
+      std::cout << "Time to check sample data: " << tm3 - tm2 << std::endl;
+    }
+  }
+
+  /// Merge all of the sample lists
+  m_sample_list.all_gather_packed_lists(*m_comm);
+  set_file_dir(m_sample_list.get_samples_dirname());
+
+  double tm4 = get_time();
+  if(is_master()) {
+    std::cout << "Time to gather sample list '" << sample_list_file << "': " << tm4 - tm3 << std::endl;
  }
 }
 
diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp
index 57a4e0222f9..27295025bae 100644
--- a/src/data_store/data_store_conduit.cpp
+++ b/src/data_store/data_store_conduit.cpp
@@ -991,7 +991,7 @@ void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vector(m_reader) failed");
   }
-  const std::vector &image_list = image_reader->get_image_list();
+  const auto& sample_list = image_reader->get_sample_list();
 
   std::vector my_image_sizes;
 
   // this block fires if we're exchanging cache data at the end
@@ -1008,10 +1008,12 @@ void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vectorsize(); h += m_np_in_trainer) {
       ++m_my_num_indices;
-      const std::string fn = m_reader->get_file_dir() + '/' + image_list[(*m_shuffled_indices)[h]].first;
+      const auto file_id = sample_list[(*m_shuffled_indices)[h]].first;
+      const std::string fn = m_reader->get_file_dir() + '/'
+                           + sample_list.get_samples_filename(file_id);
      std::ifstream in(fn.c_str());
      if (!in) {
-        LBANN_ERROR("failed to open ", fn, " for reading; file_dir: ", m_reader->get_file_dir(), " fn: ", image_list[h].first, "; role: ", m_reader->get_role());
+        LBANN_ERROR("failed to open ", fn, " for reading; role: ", m_reader->get_role());
      }
      in.seekg(0, std::ios::end);
      my_image_sizes.push_back((*m_shuffled_indices)[h]);
@@ -1036,7 +1038,7 @@ void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vector work(image_list.size()*2);
+  std::vector work(sample_list.size()*2);
   m_comm->trainer_all_gather(my_image_sizes, work, counts, disp);
   indices.resize(m_np_in_trainer);
   for (int h=0; h &work, map_is_t &sizes, st
   //get the list of images from the data reader
   image_data_reader *image_reader = dynamic_cast(m_reader);
-  const std::vector &image_list = image_reader->get_image_list();
+  const auto& sample_list = image_reader->get_sample_list();
 
   //read the images
   size_t offset = 0;
@@ -1219,7 +1221,9 @@ void data_store_conduit::read_files(std::vector &work, map_is_t &sizes, st
   for (size_t j=0; jget_file_dir() + '/' + image_list[idx].first;
+    const auto file_id = sample_list[idx].first;
+    const std::string fn = m_reader->get_file_dir() + '/'
+                         + sample_list.get_samples_filename(file_id);
     std::ifstream in(fn, std::ios::in | std::ios::binary);
     in.read(work.data()+offset, s);
     in.close();
@@ -1229,10 +1233,10 @@ void data_store_conduit::read_files(std::vector &work, map_is_t &sizes, st
 void data_store_conduit::build_conduit_nodes(map_is_t &sizes) {
   image_data_reader *image_reader = dynamic_cast(m_reader);
-  const std::vector &image_list = image_reader->get_image_list();
   for (auto t : sizes) {
     int data_id = t.first;
-    int label = image_list[data_id].second;
+    const auto sample = image_reader->get_sample(static_cast(data_id));
+    const auto label = sample.second;
     if (m_image_offsets.find(data_id) == m_image_offsets.end()) {
       LBANN_ERROR("m_image_offsets.find(data_id) == m_image_offsets.end() for data_id: ", data_id);
     }
diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp
index aee65bb7760..d3e3b128a80 100644
--- a/src/proto/proto_common.cpp
+++ b/src/proto/proto_common.cpp
@@ -102,6 +102,8 @@ void init_data_readers(
     set_transform_pipeline = false;
   } else if ((name == "imagenet")) {
     init_image_data_reader(readme, pb_metadata, master, reader);
+    reader->set_data_sample_list(readme.sample_list());
+    reader->keep_sample_order(readme.sample_list_keep_order());
     set_transform_pipeline = false;
   } else if (name == "jag_conduit") {
     init_image_data_reader(readme, pb_metadata, master, reader);
@@ -110,9 +112,10 @@ void init_data_readers(
     const lbann_data::Model& pb_model = p.model();
     const lbann_data::Trainer& pb_trainer = p.trainer();
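
// For reference, a hypothetical reader block exercising these sample-list
// fields (their definitions appear in the reader.proto diff below) would be:
//
//   reader {
//     name: "imagenet"
//     role: "train"
//     sample_list: "/path/to/train_sample_list.txt"
//     sample_list_per_trainer: false
//     sample_list_per_model: false
//     sample_list_keep_order: true
//   }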
reader->set_mini_batch_size(static_cast(pb_trainer.mini_batch_size())); - reader->set_data_index_list(readme.index_list()); - reader_jag_conduit->set_list_per_trainer(readme.index_list_per_trainer()); - reader_jag_conduit->set_list_per_model(readme.index_list_per_model()); + reader->set_data_sample_list(readme.sample_list()); + reader_jag_conduit->set_list_per_trainer(readme.sample_list_per_trainer()); + reader_jag_conduit->set_list_per_model(readme.sample_list_per_model()); + reader_jag_conduit->keep_sample_order(readme.sample_list_keep_order()); /// Allow the prototext to control if the data readers is /// shareable for each phase training, validation, or testing @@ -656,18 +659,18 @@ void set_data_readers_filenames( } } -void set_data_readers_index_list( +void set_data_readers_sample_list( const std::string& which, lbann_data::LbannPB& p) { options *opts = options::get(); lbann_data::DataReader *readers = p.mutable_data_reader(); int size = readers->reader_size(); - const std::string key_role = "index_list_" + which; + const std::string key_role = "sample_list_" + which; for (int j=0; jmutable_reader(j); if (r->role() == which) { - r->set_index_list(opts->get_string(key_role)); + r->set_sample_list(opts->get_string(key_role)); } } } @@ -690,7 +693,7 @@ void set_data_readers_percent(lbann_data::LbannPB& p) } } -void customize_data_readers_index_list(const lbann_comm& comm, lbann_data::LbannPB& p) +void customize_data_readers_sample_list(const lbann_comm& comm, lbann_data::LbannPB& p) { lbann_data::DataReader *readers = p.mutable_data_reader(); const lbann_data::Model& pb_model = p.model(); @@ -698,17 +701,26 @@ void customize_data_readers_index_list(const lbann_comm& comm, lbann_data::Lbann for (int j=0; jmutable_reader(j); std::ostringstream s; - std::string basename = get_basename_without_ext(r->index_list()); - std::string ext = get_ext_name(r->index_list()); - if(r->index_list_per_model()) { + std::string basename = get_basename_without_ext(r->sample_list()); + std::string ext = get_ext_name(r->sample_list()); + std::string dir = lbann::file::extract_parent_directory(r->sample_list()); + if ((r->sample_list()).empty()) { + continue; + } + if (dir.empty()) { + dir = "."; + } + + s << dir << '/'; + if(r->sample_list_per_model()) { s << pb_model.name() << "_"; } - if(r->index_list_per_trainer()) { + if(r->sample_list_per_trainer()) { s << "t" << comm.get_trainer_rank() << "_"; } s << basename; s << "." 
<< ext; - r->set_index_list(s.str()); + r->set_sample_list(s.str()); } } @@ -737,17 +749,26 @@ void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p) or opts->has_string("label_filename_train")) { set_data_readers_filenames("train", p); } + if (opts->has_string("data_filedir") + or opts->has_string("data_filedir_validate") + or opts->has_string("data_filename_validate") + or opts->has_string("label_filename_validate")) { + set_data_readers_filenames("validate", p); + } if (opts->has_string("data_filedir") or opts->has_string("data_filedir_test") or opts->has_string("data_filename_test") or opts->has_string("label_filename_test")) { set_data_readers_filenames("test", p); } - if (opts->has_string("index_list_train")) { - set_data_readers_index_list("train", p); + if (opts->has_string("sample_list_train")) { + set_data_readers_sample_list("train", p); + } + if (opts->has_string("sample_list_validate")) { + set_data_readers_sample_list("validate", p); } - if (opts->has_string("index_list_test")) { - set_data_readers_index_list("test", p); + if (opts->has_string("sample_list_test")) { + set_data_readers_sample_list("test", p); } if (opts->has_string("data_reader_percent")) { set_data_readers_percent(p); @@ -928,7 +949,7 @@ void print_help(std::ostream& os) " sets the file directory for train and test data\n" " --data_filedir_train= --data_filedir_test=\n" " --data_filename_train= --data_filename_test=\n" - " --index_list_train= --index_list_test=\n" + " --sample_list_train= --sample_list_test=\n" " --label_filename_train= --label_filename_test=\n" " --data_reader_percent=\n" " --share_testing_data_readers=\n" diff --git a/src/proto/reader.proto b/src/proto/reader.proto index e06050aacff..5f0fec50ba7 100644 --- a/src/proto/reader.proto +++ b/src/proto/reader.proto @@ -44,7 +44,7 @@ message Reader { string data_local_filedir = 50; //to support data_store string data_filename = 6; string label_filename = 7; - string index_list = 8; + string sample_list = 8; double validation_percent = 9; int64 absolute_sample_count = 11; int64 first_n = 200; @@ -84,10 +84,12 @@ message Reader { // 2 - there's a set of overlap indices that are common to all models //------------- end of only for partitioned data sets ------------------ - //------------- start of only for index lists ------------------ - bool index_list_per_trainer = 400; - bool index_list_per_model = 401; - //------------- end of only for index lists ------------------ + //------------- start of only for sample lists ------------------ + bool sample_list_per_trainer = 400; + bool sample_list_per_model = 401; + // For testing and validation, keep the loaded sample order same as that in the file + bool sample_list_keep_order = 402; + //------------- end of only for sample lists ------------------ PythonDataReader python = 501; diff --git a/src/utils/file_utils.cpp b/src/utils/file_utils.cpp index b2c806f9ac5..4bb9b1bbb31 100644 --- a/src/utils/file_utils.cpp +++ b/src/utils/file_utils.cpp @@ -172,7 +172,7 @@ bool create_dir(const std::string dirname) { } /// Load a file into a buffer -bool load_file(const std::string filename, std::vector& buf) { +bool load_file(const std::string filename, std::vector& buf, bool append) { std::ifstream file(filename, std::ios::binary); if (!file.good()) { return false; @@ -181,13 +181,17 @@ bool load_file(const std::string filename, std::vector& buf) { file.unsetf(std::ios::skipws); file.seekg(0, std::ios::end); - const std::streampos file_size = file.tellg(); + const std::streamsize file_size = 
static_cast<std::streamsize>(file.tellg());
file.seekg(0, std::ios::beg);
- buf.resize(file_size);
+ if (!append) {
+ buf.clear();
+ }
+ const size_t cur_size = buf.size();
+ buf.resize(static_cast<size_t>(file_size) + cur_size);
- file.read(buf.data(), file_size);
+ file.read(buf.data() + cur_size, file_size);
return true;
}
diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp
index 3c42d6dd461..62937b58638 100644
--- a/src/utils/lbann_library.cpp
+++ b/src/utils/lbann_library.cpp
@@ -194,8 +194,8 @@ std::unique_ptr<trainer> construct_trainer(lbann_comm *comm,
std::vector<int> data_seq_random_seeds(comm->get_procs_in_world());
comm->world_all_gather(data_seq_random_seed, data_seq_random_seeds);
- // Update the index lists to accomodate multi-trainer / multi-model specification
- customize_data_readers_index_list(*comm, pb);
+ // Update the sample lists to accommodate multi-trainer / multi-model specification
+ customize_data_readers_sample_list(*comm, pb);
// Initialize data readers
//@todo: code not in place for correctly handling image preprocessing

From d1e850ed300962ff43c3434f449c90f56acd986b Mon Sep 17 00:00:00 2001
From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com>
Date: Wed, 23 Sep 2020 14:33:57 +0900
Subject: [PATCH 32/36] Enhance the HDF5 data reader with Distconv (#1580)

* Merge changes from distconv
* Add hdf5_data_reader
* Merge changes in distconv
* Enable slab-based reading when data_reader_hdf5 is used
* Specify data and responses key names in prototexts
* Remove an outdated assertion
* WIP: Support labels in the HDF5 data reader
* Enable users to use either labels or responses in the HDF5 data reader
* Add the label_reconstruction data reader type for segmentation tasks
* Implement hyperslab labels in the HDF5 data reader
* Fix a bug
* Fix a bug
* Support the use_labels option in cross-entropy layers
* Fix a bug of the HDF5 data reader when responses are enabled
* Fix compilation bugs of cross entropy layers
* Support arbitrary data type for HDF5 data readers
* Disable an assertion
* Refactoring
* Use variable length response vectors for the HDF5 data reader
* Remove an unnecessary protobuf field
* Update comments of the HDF5 data reader
* Give a warning for the old CosmoFlow data reader
* Fix a bug of the HDF5 data reader when responses are enabled
* Get the max mini-batch size via the trainer in Distconv adapters
* Pass the maximum mini-batch size to models explicitly
* Instantiate base_convolution_adapter
* Fix the label reconstruction mode
* Disable input tensor shuffling in the partitioned HDF5 data reader
* Fix the label_reconstruction mode
* Fix data read offsets and strides when Distconv is enabled
* Do not check whether input tensor shuffle is required in input_distconv_adapter. This information is already passed via DataReaderMetaData.
* Fix Distconv's conv. and BN layers that use the obsolete LBANN interface.
* Fixed part of a bad merge in the input layer
* Fixed the const-ness of the m_shuffled_required field
* Fix an issue where convolutional layers' weights are referenced during Distconv's setup. m_kernel and m_bias are set on the forward passes.
* Revert the last change in input layers' m_shuffle_required field

Co-authored-by: Naoya Maruyama
Co-authored-by: Brian C.
Van Essen --- .../data_coordinator/data_coordinator.hpp | 9 +- .../data_coordinator_metadata.hpp | 9 +- include/lbann/data_readers/data_reader.hpp | 10 + .../lbann/data_readers/data_reader_hdf5.hpp | 75 ++++++- .../io/data_buffers/generic_io_buffer.hpp | 2 + include/lbann/layers/activations/identity.hpp | 2 +- .../lbann/layers/activations/leaky_relu.hpp | 2 +- include/lbann/layers/activations/relu.hpp | 2 +- include/lbann/layers/activations/softmax.hpp | 2 +- include/lbann/layers/data_type_layer.hpp | 2 +- include/lbann/layers/io/input/input_layer.hpp | 9 +- include/lbann/layers/layer.hpp | 4 +- .../layers/learning/base_convolution.hpp | 2 +- include/lbann/layers/learning/convolution.hpp | 2 +- .../lbann/layers/learning/deconvolution.hpp | 2 +- include/lbann/layers/loss/cross_entropy.hpp | 39 +++- .../regularizers/batch_normalization.hpp | 2 +- .../lbann/layers/transform/concatenate.hpp | 2 +- include/lbann/layers/transform/pooling.hpp | 2 +- include/lbann/layers/transform/split.hpp | 2 +- include/lbann/layers/transform/sum.hpp | 2 +- src/data_coordinator/data_coordinator.cpp | 16 +- .../data_coordinator_metadata.cpp | 2 + src/data_readers/data_reader_hdf5.cpp | 195 +++++++++++++----- src/io/data_buffers/partitioned_io_buffer.cpp | 6 +- src/layers/data_type_layer.cpp | 2 +- src/layers/io/input/input_layer.cpp | 27 +-- src/layers/layer.cpp | 6 +- src/layers/learning/base_convolution.cpp | 7 +- src/layers/learning/convolution.cpp | 3 +- src/layers/learning/deconvolution.cpp | 2 +- src/proto/factories/layer_factory.cpp | 6 +- src/proto/layers.proto | 6 +- src/proto/proto_common.cpp | 21 +- src/proto/reader.proto | 9 + 35 files changed, 366 insertions(+), 125 deletions(-) diff --git a/include/lbann/data_coordinator/data_coordinator.hpp b/include/lbann/data_coordinator/data_coordinator.hpp index e4a9ae06c01..3f568b154bd 100644 --- a/include/lbann/data_coordinator/data_coordinator.hpp +++ b/include/lbann/data_coordinator/data_coordinator.hpp @@ -35,7 +35,9 @@ #include #include #include - +#ifdef LBANN_HAS_DISTCONV +#include "lbann/data_readers/data_reader_hdf5.hpp" +#endif // LBANN_HAS_DISTCONV namespace lbann { @@ -147,6 +149,7 @@ class data_coordinator { map[data_reader_target_mode::CLASSIFICATION] = std::vector(1, dr->get_num_labels()); map[data_reader_target_mode::REGRESSION] = std::vector(1, dr->get_num_responses()); map[data_reader_target_mode::RECONSTRUCTION] = dr->get_data_dims(); + map[data_reader_target_mode::LABEL_RECONSTRUCTION] = dr->get_data_dims(); map[data_reader_target_mode::NA] = std::vector(1, 0); return map; } @@ -182,6 +185,10 @@ class data_coordinator { DataReaderMetaData drm; drm.data_dims = get_data_dims(); drm.slice_points = get_slice_points(); +#ifdef LBANN_HAS_DISTCONV + const auto training_dr = m_data_readers[execution_mode::training]; + drm.shuffle_required = training_dr->is_tensor_shuffle_required(); +#endif // LBANN_HAS_DISTCONV return drm; } diff --git a/include/lbann/data_coordinator/data_coordinator_metadata.hpp b/include/lbann/data_coordinator/data_coordinator_metadata.hpp index d9c37f23527..8206a4a11a5 100644 --- a/include/lbann/data_coordinator/data_coordinator_metadata.hpp +++ b/include/lbann/data_coordinator/data_coordinator_metadata.hpp @@ -30,6 +30,7 @@ #include #include "lbann/utils/enum_iterator.hpp" +#include "lbann/utils/distconv.hpp" #include #include @@ -38,7 +39,7 @@ namespace lbann { // NA - Not applicable, used for input layers that don't produce a second output -enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, 
INPUT, NA};
+enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, LABEL_RECONSTRUCTION, INPUT, NA};
std::string to_string(data_reader_target_mode m);
/// Map from target modes to dimension maps
using TargetModeDimMap = std::unordered_map>;
@@ -57,6 +58,12 @@ using slice_points_mode_iterator = enum_iterator
#include
#include
@@ -670,6 +672,14 @@ class generic_data_reader {
m_transform_pipeline = std::move(tp);
}
+#ifdef LBANN_HAS_DISTCONV
+ /**
+ * Returns whether shuffle (which refers to input data shuffling for
+ * Distconv but not random sample shuffling) is required.
+ */
+ virtual bool is_tensor_shuffle_required() const { return true; }
+#endif // LBANN_HAS_DISTCONV
+
protected:
bool m_verbose = false;
diff --git a/include/lbann/data_readers/data_reader_hdf5.hpp b/include/lbann/data_readers/data_reader_hdf5.hpp
index e01de13ac7c..1719d542256 100644
--- a/include/lbann/data_readers/data_reader_hdf5.hpp
+++ b/include/lbann/data_readers/data_reader_hdf5.hpp
@@ -19,11 +19,30 @@ namespace lbann {
/**
- * Data reader for data stored in hdf5 files will need to assume the file contains x
+ * Data reader for data stored in HDF5 files. This data reader was
+ * designed to work with Distconv. It currently has two different
+ * modes:
+ * * Datasets with 3D data and a small number of responses:
+ * This mode assumes a 3D cube dataset such as the CosmoFlow dataset.
+ * This requires set_has_responses to be called on setup.
+ * * Datasets with 3D data and 3D labels:
+ * This mode assumes 3D cubes with corresponding 3D label tensors
+ * such as the LiTS dataset. This requires set_has_labels to be
+ * called on setup, and label_reconstruction should be used for the
+ * input layer.
+ *
+ * Each HDF5 file should contain hdf5_key_data, hdf5_key_labels, and
+ * hdf5_key_responses keys to read data, labels, and responses,
+ * respectively.
*/
+template <typename TensorDataType>
class hdf5_reader : public generic_data_reader {
public:
- hdf5_reader(const bool shuffle);
+ hdf5_reader(const bool shuffle,
+ const std::string key_data,
+ const std::string key_label,
+ const std::string key_responses,
+ const bool hyperslab_labels);
hdf5_reader(const hdf5_reader&);
hdf5_reader& operator=(const hdf5_reader&);
~hdf5_reader() override {}
@@ -39,36 +58,71 @@ class hdf5_reader : public generic_data_reader {
void load() override;
void set_hdf5_paths(const std::vector<std::string> hdf5_paths) {m_file_paths = hdf5_paths;}
+ void set_has_labels(const bool b) { m_has_labels = b; }
+ void set_has_responses(const bool b) { m_has_responses = b; }
+ void set_num_responses(const size_t num_responses) {
+ m_all_responses.resize(num_responses);
+ }
+
+ int get_num_labels() const override {
+ if(!m_has_labels) {
+ return generic_data_reader::get_num_labels();
+ }
+ // This data reader currently assumes that the shape of the label
+ // tensor is the same as the data tensor.
+ return m_num_features;
+ }
int get_num_responses() const override {
+ if(!m_has_responses) {
+ return generic_data_reader::get_num_responses();
+ }
return get_linearized_response_size();
}
int get_linearized_data_size() const override {
return m_num_features;
}
+ int get_linearized_label_size() const override {
+ if(!m_has_labels) {
+ return generic_data_reader::get_linearized_label_size();
+ }
+ // This data reader currently assumes that the shape of the label
+ // tensor is the same as the data tensor.
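+ // (e.g., one label per voxel for segmentation datasets such as LiTS)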
+ return m_num_features; + } int get_linearized_response_size() const override { - return m_num_response_features; + if(!m_has_responses) { + return generic_data_reader::get_linearized_response_size(); + } + return m_all_responses.size(); } const std::vector get_data_dims() const override { return m_data_dims; } + +#ifdef LBANN_HAS_DISTCONV + bool is_tensor_shuffle_required() const override { return false; } +#endif // LBANN_HAS_DISTCONV + protected: void read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, int rank, - short *sample); - void read_hdf5_sample(int data_id, short *sample); + TensorDataType *sample); + void read_hdf5_sample(int data_id, TensorDataType *sample, TensorDataType *labels); //void set_defaults() override; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; void fetch_datum_conduit(Mat& X, int data_id); bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; void gather_responses(float *responses); + hid_t get_hdf5_data_type() const; + conduit::DataType get_conduit_data_type(conduit::index_t num_elements) const; + /// Whether to fetch a label from the last column. bool m_has_labels = false; /// Whether to fetch a response from the last column. - bool m_has_responses = true; - int m_image_depth=0; + bool m_has_responses = false; + int m_image_depth = 0; size_t m_num_features; - static constexpr int m_num_response_features = 4; - float m_all_responses[m_num_response_features]; + std::vector m_all_responses; std::vector m_file_paths; MPI_Comm m_comm; std::vector m_data_dims; @@ -77,6 +131,9 @@ class hdf5_reader : public generic_data_reader { hid_t m_dxpl; MPI_Comm m_response_gather_comm; bool m_use_data_store; + std::string m_key_data, m_key_labels, m_key_responses; + bool m_hyperslab_labels; + private: static const std::string HDF5_KEY_DATA, HDF5_KEY_LABELS, HDF5_KEY_RESPONSES; }; diff --git a/include/lbann/io/data_buffers/generic_io_buffer.hpp b/include/lbann/io/data_buffers/generic_io_buffer.hpp index eedfccdac68..440742d698f 100644 --- a/include/lbann/io/data_buffers/generic_io_buffer.hpp +++ b/include/lbann/io/data_buffers/generic_io_buffer.hpp @@ -55,6 +55,7 @@ class fetch_data_functor { case data_reader_target_mode::NA: throw lbann_exception("Invalid data reader target mode"); case data_reader_target_mode::CLASSIFICATION: + case data_reader_target_mode::LABEL_RECONSTRUCTION: default: num_responses_fetched = data_reader->fetch_labels(responses); } @@ -73,6 +74,7 @@ class fetch_data_functor { case data_reader_target_mode::REGRESSION: case data_reader_target_mode::RECONSTRUCTION: case data_reader_target_mode::CLASSIFICATION: + case data_reader_target_mode::LABEL_RECONSTRUCTION: default: throw lbann_exception("Invalid data reader target mode"); } diff --git a/include/lbann/layers/activations/identity.hpp b/include/lbann/layers/activations/identity.hpp index ff59d2138dd..ec73dec5e08 100644 --- a/include/lbann/layers/activations/identity.hpp +++ b/include/lbann/layers/activations/identity.hpp @@ -78,7 +78,7 @@ class identity_layer : public data_type_layer { bool is_distconv_supported() const override { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/activations/leaky_relu.hpp b/include/lbann/layers/activations/leaky_relu.hpp index 
b936a5ac1b9..20e871fd030 100644 --- a/include/lbann/layers/activations/leaky_relu.hpp +++ b/include/lbann/layers/activations/leaky_relu.hpp @@ -96,7 +96,7 @@ class leaky_relu_layer : public data_type_layer { bool is_distconv_supported() const override { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp index f95c663ac86..f4d0996bc36 100644 --- a/include/lbann/layers/activations/relu.hpp +++ b/include/lbann/layers/activations/relu.hpp @@ -65,7 +65,7 @@ class relu_layer : public data_type_layer { bool is_distconv_supported() const override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp index 0a3a4c9917a..0bf855d914d 100644 --- a/include/lbann/layers/activations/softmax.hpp +++ b/include/lbann/layers/activations/softmax.hpp @@ -189,7 +189,7 @@ class softmax_layer : public data_type_layer { bool is_distconv_supported() const override { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/data_type_layer.hpp b/include/lbann/layers/data_type_layer.hpp index 2c363ccef21..c79d0e63b32 100644 --- a/include/lbann/layers/data_type_layer.hpp +++ b/include/lbann/layers/data_type_layer.hpp @@ -365,7 +365,7 @@ class data_type_layer : public Layer { const data_type_distconv_adapter& get_distconv_adapter() const override; protected: - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; #endif // LBANN_HAS_DISTCONV #ifdef LBANN_HAS_CUDA diff --git a/include/lbann/layers/io/input/input_layer.hpp b/include/lbann/layers/io/input/input_layer.hpp index 35bdee54a92..87b9fc8f88f 100644 --- a/include/lbann/layers/io/input/input_layer.hpp +++ b/include/lbann/layers/io/input/input_layer.hpp @@ -47,7 +47,7 @@ class input_distconv_adapter: public data_type_distconv_adapter using TensorHost = dc::TensorHost; using TensorHostShuffler = dc::TensorHostShuffler; - input_distconv_adapter(Layer& layer); + input_distconv_adapter(Layer& layer, const bool shuffle_required); virtual ~input_distconv_adapter() = default; TensorHostShuffler &get_shuffler(const TensorHost &src, const TensorHost &dst, @@ -78,7 +78,7 @@ class input_distconv_adapter: public data_type_distconv_adapter std::vector> m_original_host_tensors; std::vector> m_host_tensors; - bool m_shuffle_required; + const bool m_shuffle_required; std::vector, 4>> m_shufflers; std::unique_ptr m_shuffler_src_buf; size_t m_shuffler_src_buf_size = 0; @@ -142,8 +142,9 @@ class input_layer : public generic_input_layer { bool is_distconv_supported() const override { return Dev == El::Device::CPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { - this->get_distconv_adapter_ptr() = make_unique(*this); + void setup_distconv_adapter(const 
DataReaderMetaData& dr_metadata) override { + this->get_distconv_adapter_ptr() = make_unique( + *this, dr_metadata.shuffle_required); } distconv_adapter_type& get_distconv_adapter() override; const distconv_adapter_type& get_distconv_adapter() const override; diff --git a/include/lbann/layers/layer.hpp b/include/lbann/layers/layer.hpp index 9778e2f433f..f43ee4c0f76 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -764,8 +764,8 @@ class Layer { /** Indicate whether distconv is supported. */ virtual bool is_distconv_supported() const { return false; } /** Pre-initialize distconv attributes needed for setup_data(). */ - void prepare_distconv(); - virtual void setup_distconv_adapter() = 0; + void prepare_distconv(const DataReaderMetaData& dr_metadata); + virtual void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) = 0; std::unique_ptr& get_distconv_adapter_ptr() { return m_dc; }; const std::unique_ptr& get_distconv_adapter_ptr() const { diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index 5f15e935ee0..252314ec3b1 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -256,7 +256,7 @@ class base_convolution_layer : public data_type_layer { friend class base_convolution_adapter; protected: using BaseConvAdapterType = base_convolution_adapter; - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; BaseConvAdapterType& get_distconv_adapter() override; const BaseConvAdapterType& get_distconv_adapter() const override; #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index 19fb2daf248..8980ee32c8e 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -115,7 +115,7 @@ class convolution_layer #ifdef LBANN_HAS_DISTCONV friend class convolution_distconv_adapter; protected: - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; bool is_distconv_supported() const override; #endif // LBANN_HAS_DISTCONV }; diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index 6ebce704f9b..7977065655a 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -104,7 +104,7 @@ class deconvolution_layer : public base_convolution_layer; protected: - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; bool is_distconv_supported() const override; #endif // LBANN_HAS_DISTCONV }; diff --git a/include/lbann/layers/loss/cross_entropy.hpp b/include/lbann/layers/loss/cross_entropy.hpp index 238bda958a4..0f44d88d04e 100644 --- a/include/lbann/layers/loss/cross_entropy.hpp +++ b/include/lbann/layers/loss/cross_entropy.hpp @@ -37,7 +37,9 @@ template class cross_entropy_distconv_adapter: public data_type_distconv_adapter { public: using TensorDevType = typename data_type_distconv_adapter::TensorDevType; - cross_entropy_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + cross_entropy_distconv_adapter(Layer& layer, bool use_labels) + : data_type_distconv_adapter(layer), + m_use_labels(use_labels){} virtual ~cross_entropy_distconv_adapter() = default; void 
setup_distributions(tensor_overlap_constraints &constraints) override;
dc::Shape get_prev_activations_shape(int index) const override;
@@ -45,6 +47,7 @@ class cross_entropy_distconv_adapter: public data_type_distconv_adapter
std::unique_ptr m_cross_entropy;
+ bool m_use_labels;
};
#endif // LBANN_HAS_DISTCONV
@@ -67,12 +70,14 @@ class cross_entropy_layer : public data_type_layer {
public:
- cross_entropy_layer(lbann_comm *comm) : data_type_layer(comm) {
+ cross_entropy_layer(lbann_comm *comm, bool use_labels)
+ : data_type_layer(comm),
+ m_use_labels(use_labels) {
this->m_expected_num_parent_layers = 2;
}
cross_entropy_layer(const cross_entropy_layer& other)
- : data_type_layer(other) {
+ : data_type_layer(other), m_use_labels(other.m_use_labels) {
m_workspace.reset(other.m_workspace ?
other.m_workspace->Copy() :
nullptr);
@@ -80,6 +85,7 @@ class cross_entropy_layer : public data_type_layer {
cross_entropy_layer& operator=(const cross_entropy_layer& other) {
data_type_layer::operator=(other);
+ m_use_labels = other.m_use_labels;
m_workspace.reset(other.m_workspace ?
other.m_workspace->Copy() :
nullptr);
@@ -160,8 +166,16 @@ class cross_entropy_layer : public data_type_layer {
if (this->distconv_enabled()) {
fp_compute_distconv();
return;
+ } else {
+ if(m_use_labels) {
+ LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels.");
+ }
}
-#endif
+#else // LBANN_HAS_DISTCONV
+ if(m_use_labels) {
+ LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels.");
+ }
+#endif // LBANN_HAS_DISTCONV
// Initialize workspace
const auto& prediction = this->get_prev_activations(0);
@@ -182,6 +196,14 @@ class cross_entropy_layer : public data_type_layer {
if (this->distconv_enabled()) {
bp_compute_distconv();
return;
+ } else {
+ if(m_use_labels) {
+ LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels.");
+ }
+ }
+#else // LBANN_HAS_DISTCONV
+ if(m_use_labels) {
+ LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels.");
}
#endif // LBANN_HAS_DISTCONV
@@ -201,6 +223,9 @@ class cross_entropy_layer : public data_type_layer {
/** Compute local gradients. */
void local_bp_compute();
+ /** Use integer label tensors as ground truth. */
+ bool m_use_labels;
+
/** Workspace matrix.
*/ std::unique_ptr m_workspace; @@ -211,9 +236,9 @@ class cross_entropy_layer : public data_type_layer { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< - cross_entropy_distconv_adapter>(*this); + cross_entropy_distconv_adapter>(*this, m_use_labels); } cross_entropy_distconv_adapter& get_distconv_adapter() override; @@ -323,7 +348,7 @@ setup_distributions(tensor_overlap_constraints &constraints) { template void cross_entropy_distconv_adapter::setup_layer( size_t workspace_capacity) { - m_cross_entropy = make_unique(dc::get_backend()); + m_cross_entropy = make_unique(dc::get_backend(), m_use_labels); m_cross_entropy->setup(this->get_prev_activations(0), this->get_prev_activations(1), this->get_activations(0)); diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 4c5a3013eed..c389567ca98 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -386,7 +386,7 @@ class batch_normalization_layer : public regularizer_layer { bool is_distconv_supported() const override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< batch_normalization_distconv_adapter>(*this); } diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp index ae71499acd4..f7306baa9cd 100644 --- a/include/lbann/layers/transform/concatenate.hpp +++ b/include/lbann/layers/transform/concatenate.hpp @@ -114,7 +114,7 @@ class concatenate_layer : public data_type_layer { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL && m_concat_dim == 0; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< concatenate_distconv_adapter>(*this); } diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index 35db88a633c..176fc26abd0 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -542,7 +542,7 @@ class pooling_layer : public transform_layer { friend class pooling_distconv_adapter; protected: bool is_distconv_supported() const override; - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< pooling_distconv_adapter>(*this); } diff --git a/include/lbann/layers/transform/split.hpp b/include/lbann/layers/transform/split.hpp index 90248ce99a8..5cdeedf681f 100644 --- a/include/lbann/layers/transform/split.hpp +++ b/include/lbann/layers/transform/split.hpp @@ -106,7 +106,7 @@ class split_layer : public transform_layer { bool is_distconv_supported() const override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/transform/sum.hpp 
b/include/lbann/layers/transform/sum.hpp index 7786f72f634..6d7c11884ff 100644 --- a/include/lbann/layers/transform/sum.hpp +++ b/include/lbann/layers/transform/sum.hpp @@ -129,7 +129,7 @@ class sum_layer : public transform_layer { bool is_distconv_supported() const override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } sum_distconv_adapter& get_distconv_adapter() override; diff --git a/src/data_coordinator/data_coordinator.cpp b/src/data_coordinator/data_coordinator.cpp index 13c3178646e..c1e63be17fd 100644 --- a/src/data_coordinator/data_coordinator.cpp +++ b/src/data_coordinator/data_coordinator.cpp @@ -26,6 +26,7 @@ #include #include +#include namespace lbann { @@ -94,13 +95,27 @@ void data_coordinator::calculate_num_iterations_per_epoch(int max_mini_batch_siz " :: generic_data_distribution: number of parallel readers is zero"); } +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + // #trainers is assumed to be 1. + assert_eq(this->m_comm->get_num_trainers(), 1); + } +#endif + /// Set the basic parameters for stride and offset of the data reader int batch_stride = max_mini_batch_size; int base_offset = this->m_comm->get_rank_in_trainer(); +#ifdef LBANN_HAS_DISTCONV + base_offset = dc::get_input_rank(*(this->m_comm)) / dc::get_number_of_io_partitions(); +#endif /// Set mini-batch size and stride data_reader->set_mini_batch_size(max_mini_batch_size); data_reader->set_stride_to_next_mini_batch(batch_stride); +#ifdef LBANN_HAS_DISTCONV + data_reader->set_sample_stride(num_parallel_readers_per_model / dc::get_number_of_io_partitions()); +#else data_reader->set_sample_stride(num_parallel_readers_per_model); +#endif data_reader->set_iteration_stride(1); /// Set data reader base offset and model offset data_reader->set_base_offset(base_offset); @@ -116,7 +131,6 @@ void data_coordinator::calculate_num_iterations_per_epoch(int max_mini_batch_siz data_reader->set_num_iterations_per_epoch(num_iterations_per_epoch); data_reader->set_last_mini_batch_size(last_mini_batch_size); data_reader->set_stride_to_last_mini_batch(data_reader->get_stride_to_next_mini_batch()); - data_reader->set_global_mini_batch_size(max_mini_batch_size); data_reader->set_global_last_mini_batch_size(last_mini_batch_size); return; diff --git a/src/data_coordinator/data_coordinator_metadata.cpp b/src/data_coordinator/data_coordinator_metadata.cpp index 0189e8f8ae1..45f699a2ce6 100644 --- a/src/data_coordinator/data_coordinator_metadata.cpp +++ b/src/data_coordinator/data_coordinator_metadata.cpp @@ -37,6 +37,8 @@ std::string to_string(const data_reader_target_mode m) { return "regression"; case data_reader_target_mode::RECONSTRUCTION: return "reconstruction"; + case data_reader_target_mode::LABEL_RECONSTRUCTION: + return "label_reconstruction"; case data_reader_target_mode::INPUT: return "input"; case data_reader_target_mode::NA: diff --git a/src/data_readers/data_reader_hdf5.cpp b/src/data_readers/data_reader_hdf5.cpp index 722b1b79e42..b7a180602af 100644 --- a/src/data_readers/data_reader_hdf5.cpp +++ b/src/data_readers/data_reader_hdf5.cpp @@ -52,19 +52,27 @@ inline hid_t check_hdf5(hid_t hid, const char *file, int line) { namespace lbann { -const std::string hdf5_reader::HDF5_KEY_DATA = "full"; -const std::string hdf5_reader::HDF5_KEY_RESPONSES = "unitPar"; - -hdf5_reader::hdf5_reader(const 
bool shuffle) +template +hdf5_reader::hdf5_reader(const bool shuffle, + const std::string key_data, + const std::string key_labels, + const std::string key_responses, + const bool hyperslab_labels) : generic_data_reader(shuffle), - m_use_data_store(options::get()->get_bool("use_data_store")) { + m_use_data_store(options::get()->get_bool("use_data_store")), + m_key_data(key_data), + m_key_labels(key_labels), + m_key_responses(key_responses), + m_hyperslab_labels(hyperslab_labels) { } -hdf5_reader::hdf5_reader(const hdf5_reader& rhs) : generic_data_reader(rhs) { +template +hdf5_reader::hdf5_reader(const hdf5_reader& rhs) : generic_data_reader(rhs) { copy_members(rhs); } -hdf5_reader& hdf5_reader::operator=(const hdf5_reader& rhs) { +template +hdf5_reader& hdf5_reader::operator=(const hdf5_reader& rhs) { // check for self-assignment if (this == &rhs) { return (*this); @@ -74,7 +82,8 @@ hdf5_reader& hdf5_reader::operator=(const hdf5_reader& rhs) { return (*this); } -void hdf5_reader::copy_members(const hdf5_reader &rhs) { +template +void hdf5_reader::copy_members(const hdf5_reader &rhs) { if(rhs.m_data_store != nullptr) { m_data_store = new data_store_conduit(rhs.get_data_store()); } @@ -88,14 +97,16 @@ void hdf5_reader::copy_members(const hdf5_reader &rhs) { m_comm = rhs.m_comm; m_file_paths = rhs.m_file_paths; m_use_data_store = rhs.m_use_data_store; - - for(size_t i = 0; i < m_num_response_features; i++) { - m_all_responses[i] = rhs.m_all_responses[i]; - } + m_key_data = rhs.m_key_data; + m_key_labels = rhs.m_key_labels; + m_key_responses = rhs.m_key_responses; + m_hyperslab_labels = rhs.m_hyperslab_labels; + m_all_responses = rhs.m_all_responses; } -void hdf5_reader::read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, - int rank, short *sample) { +template +void hdf5_reader::read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, + int rank, TensorDataType *sample) { prof_region_begin("read_hdf5_hyperslab", prof_colors[0], false); // this is the splits, right now it is hard coded to split along the // z axis @@ -120,19 +131,21 @@ void hdf5_reader::read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, offset, NULL, count, m_hyperslab_dims.data())); - CHECK_HDF5(H5Dread(h_data, H5T_NATIVE_SHORT, memspace, + CHECK_HDF5(H5Dread(h_data, get_hdf5_data_type(), memspace, filespace, m_dxpl, sample)); prof_region_end("read_hdf5_hyperslab", false); } -void hdf5_reader::read_hdf5_sample(int data_id, short *sample) { +template +void hdf5_reader::read_hdf5_sample(int data_id, TensorDataType *sample, + TensorDataType *labels) { int world_rank = get_comm()->get_rank_in_trainer(); auto file = m_file_paths[data_id]; hid_t h_file = CHECK_HDF5(H5Fopen(file.c_str(), H5F_ACC_RDONLY, m_fapl)); // load in dataset hid_t h_data = CHECK_HDF5( - H5Dopen(h_file, HDF5_KEY_DATA.c_str(), H5P_DEFAULT)); + H5Dopen(h_file, m_key_data.c_str(), H5P_DEFAULT)); hid_t filespace = CHECK_HDF5(H5Dget_space(h_data)); //get the number of dimesnionse from the dataset int rank1 = H5Sget_simple_extent_ndims(filespace); @@ -144,16 +157,24 @@ void hdf5_reader::read_hdf5_sample(int data_id, short *sample) { //close data set CHECK_HDF5(H5Dclose(h_data)); - if (m_has_responses) { - h_data = CHECK_HDF5(H5Dopen(h_file, HDF5_KEY_RESPONSES.c_str(), H5P_DEFAULT)); - CHECK_HDF5(H5Dread(h_data, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, m_all_responses)); + if (m_has_labels && labels != nullptr) { + assert_always(m_hyperslab_labels); + hid_t h_labels = CHECK_HDF5(H5Dopen(h_file, m_key_labels.c_str(), H5P_DEFAULT)); + hid_t filespace_labels 
= CHECK_HDF5(H5Dget_space(h_labels)); + read_hdf5_hyperslab(h_labels, filespace_labels, world_rank, labels); + CHECK_HDF5(H5Dclose(h_labels)); + } else if (m_has_responses) { + assert_always(labels == nullptr); + h_data = CHECK_HDF5(H5Dopen(h_file, m_key_responses.c_str(), H5P_DEFAULT)); + CHECK_HDF5(H5Dread(h_data, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &m_all_responses[0])); CHECK_HDF5(H5Dclose(h_data)); } CHECK_HDF5(H5Fclose(h_file)); return; } -void hdf5_reader::load() { +template +void hdf5_reader::load() { lbann_comm* l_comm = get_comm(); MPI_Comm mpi_comm = l_comm->get_trainer_comm().GetMPIComm(); int world_rank = l_comm->get_rank_in_trainer(); @@ -173,7 +194,7 @@ void hdf5_reader::load() { if (m_file_paths.size() > 0) { const hid_t h_file = CHECK_HDF5(H5Fopen(m_file_paths[0].c_str(), H5F_ACC_RDONLY, H5P_DEFAULT)); - const hid_t h_data = CHECK_HDF5(H5Dopen(h_file, HDF5_KEY_DATA.c_str(), + const hid_t h_data = CHECK_HDF5(H5Dopen(h_file, m_key_data.c_str(), H5P_DEFAULT)); const hid_t h_space = CHECK_HDF5(H5Dget_space(h_data)); if (CHECK_HDF5(H5Sget_simple_extent_ndims(h_space)) != 4) { @@ -192,7 +213,6 @@ void hdf5_reader::load() { (size_t) 1, std::multiplies()); - for (auto i: m_data_dims) { m_hyperslab_dims.push_back(i); } @@ -222,30 +242,50 @@ void hdf5_reader::load() { MPI_Comm_dup(dc::get_mpi_comm(), &m_response_gather_comm); } -bool hdf5_reader::fetch_label(Mat& Y, int data_id, int mb_idx) { +template +bool hdf5_reader::fetch_label(Mat& Y, int data_id, int mb_idx) { + if(!m_has_labels) { + return generic_data_reader::fetch_label(Y, data_id, mb_idx); + } + + prof_region_begin("fetch_label", prof_colors[0], false); + assert_always(m_hyperslab_labels); + assert_always(m_use_data_store); + TensorDataType *buf = nullptr; + assert_eq(Y.Height(), m_num_features); + conduit::Node node; + const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); + node.set_external(ds_node); + const std::string conduit_obj = LBANN_DATA_ID_STR(data_id); + buf = node[conduit_obj+"/labels_slab"].value(); + std::memcpy(Y.Buffer(), buf, m_num_features/dc::get_number_of_io_partitions()*sizeof(TensorDataType)); + prof_region_end("fetch_label", false); return true; } -bool hdf5_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { +template +bool hdf5_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { prof_region_begin("fetch_datum", prof_colors[0], false); // In the Cosmoflow case, each minibatch should have only one // sample per rank. 
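// (each rank then reads only its own hyperslab of that sample,
// split along the z axis by read_hdf5_hyperslab)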
assert_eq(X.Width(), 1); + assert_eq(sizeof(DataType)%sizeof(TensorDataType), 0); assert_eq(X.Height(), m_num_features / dc::get_number_of_io_partitions() - / (sizeof(DataType) / sizeof(short))); + / (sizeof(DataType) / sizeof(TensorDataType))); if (m_use_data_store) { fetch_datum_conduit(X, data_id); } else { - read_hdf5_sample(data_id, (short*)X.Buffer()); + read_hdf5_sample(data_id, (TensorDataType*)X.Buffer(), nullptr); } prof_region_end("fetch_datum", false); return true; } -void hdf5_reader::fetch_datum_conduit(Mat& X, int data_id) { +template +void hdf5_reader::fetch_datum_conduit(Mat& X, int data_id) { const std::string conduit_key = LBANN_DATA_ID_STR(data_id); // Create a node to hold all of the data conduit::Node node; @@ -256,11 +296,24 @@ void hdf5_reader::fetch_datum_conduit(Mat& X, int data_id) { prof_region_end("get_conduit_node", false); } else { auto &conduit_obj = node[conduit_key + "/slab"]; - conduit_obj.set(conduit::DataType::int16( + conduit_obj.set(get_conduit_data_type( m_num_features / dc::get_number_of_io_partitions())); - short *sample_buf = conduit_obj.value(); - read_hdf5_sample(data_id, sample_buf); - node[conduit_key + "/responses"].set(m_all_responses, 4); + TensorDataType *sample_buf = conduit_obj.value(); + if(m_has_labels) { + assert_always(m_hyperslab_labels); + auto &conduit_labels_obj = node[conduit_key + "/labels_slab"]; + conduit_labels_obj.set(get_conduit_data_type( + m_num_features / dc::get_number_of_io_partitions())); + TensorDataType *labels_buf = conduit_labels_obj.value(); + read_hdf5_sample(data_id, sample_buf, labels_buf); + } else { + read_hdf5_sample(data_id, sample_buf, nullptr); + } + if(m_has_responses) { + node[conduit_key + "/responses"].set( + &m_all_responses[0], + m_all_responses.size()); + } if (priming_data_store()) { // Once the node has been populated save it in the data store m_data_store->set_conduit_node(data_id, node); @@ -270,29 +323,49 @@ void hdf5_reader::fetch_datum_conduit(Mat& X, int data_id) { conduit::Node slab; slab.set_external(node[conduit_key + "/slab"]); prof_region_end("set_external", false); - short *data = slab.value(); + TensorDataType *data = slab.value(); prof_region_begin("copy_to_buffer", prof_colors[0], false); std::memcpy(X.Buffer(), data, slab.dtype().number_of_elements()*slab.dtype().element_bytes()); prof_region_end("copy_to_buffer", false); } //get from a cached response -bool hdf5_reader::fetch_response(Mat& Y, int data_id, int mb_idx) { +template +bool hdf5_reader::fetch_response(Mat& Y, int data_id, int mb_idx) { + if(!m_has_responses) { + return generic_data_reader::fetch_response(Y, data_id, mb_idx); + } + prof_region_begin("fetch_response", prof_colors[0], false); - assert_eq(Y.Height(), m_num_response_features); float *buf = nullptr; - if (data_store_active()) { + if(m_hyperslab_labels) { + assert_eq(Y.Height(), m_num_features); + const std::string conduit_key = LBANN_DATA_ID_STR(data_id); conduit::Node node; const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); node.set_external(ds_node); - const std::string conduit_obj = LBANN_DATA_ID_STR(data_id); - buf = node[conduit_obj+"/responses"].value(); - }else { - buf = m_all_responses; + conduit::Node slab; + slab.set_external(node[conduit_key + "/responses_slab"]); + prof_region_end("set_external", false); + buf = slab.value(); + std::memcpy(Y.Buffer(), buf, m_num_features*sizeof(TensorDataType)); + } else { + assert_eq(Y.Height(), m_all_responses.size()); + if (data_store_active()) { + conduit::Node node; + const 
conduit::Node& ds_node = m_data_store->get_conduit_node(data_id);
+ node.set_external(ds_node);
+ const std::string conduit_obj = LBANN_DATA_ID_STR(data_id);
+ buf = node[conduit_obj+"/responses"].value();
+ }else {
+ buf = &m_all_responses[0];
+ }
+ std::memcpy(Y.Buffer(), buf,
+ m_all_responses.size()*sizeof(DataType));
+ if (dc::get_rank_stride() == 1) {
+ gather_responses(Y.Buffer());
+ }
}
- std::memcpy(Y.Buffer(), buf,
- m_num_response_features*sizeof(DataType));
- gather_responses(Y.Buffer());
prof_region_end("fetch_response", false);
return true;
}
@@ -300,8 +373,9 @@ bool hdf5_reader::fetch_response(Mat& Y, int data_id, int mb_idx) {
// Gather scattered responses to the first N ranks, where N is the
// mini-batch size. This is not necessary when the rank reordering
// is used.
-void hdf5_reader::gather_responses(float *responses) {
- float recv_buf[m_num_response_features];
+template
+void hdf5_reader::gather_responses(float *responses) {
+ float recv_buf[m_all_responses.size()];
const int rank = dc::get_mpi_rank();
const int num_part = dc::get_number_of_io_partitions();
const int mini_batch_size = this->get_loaded_mini_batch_size();
@@ -313,14 +387,14 @@ void hdf5_reader::gather_responses(float *responses) {
// send
if (rank % num_part == 0) {
- MPI_Isend(responses, m_num_response_features, MPI_FLOAT, dst_rank,
+ MPI_Isend(responses, m_all_responses.size(), MPI_FLOAT, dst_rank,
tag, m_response_gather_comm, &req[req_idx]);
++req_idx;
}
// recv
if (rank < mini_batch_size) {
- MPI_Irecv(recv_buf, m_num_response_features, MPI_FLOAT, src_rank, tag,
+ MPI_Irecv(recv_buf, m_all_responses.size(), MPI_FLOAT, src_rank, tag,
m_response_gather_comm, &req[req_idx]);
++req_idx;
}
@@ -329,7 +403,32 @@
MPI_Waitall(req_idx, req, MPI_STATUS_IGNORE);
}
- std::memcpy(responses, recv_buf, sizeof(float) * m_num_response_features);
+ std::memcpy(responses, recv_buf, sizeof(float) * m_all_responses.size());
+}
+
+template<> hid_t hdf5_reader<float>::get_hdf5_data_type() const {
+ return H5T_NATIVE_FLOAT;
+}
+template<> hid_t hdf5_reader<double>::get_hdf5_data_type() const {
+ return H5T_NATIVE_DOUBLE;
+}
+template<> hid_t hdf5_reader<short>::get_hdf5_data_type() const {
+ return H5T_NATIVE_SHORT;
+}
+
+template<> conduit::DataType hdf5_reader<float>::get_conduit_data_type(conduit::index_t num_elements) const {
+ return conduit::DataType::float32(num_elements);
+}
+template<> conduit::DataType hdf5_reader<double>::get_conduit_data_type(conduit::index_t num_elements) const {
+ return conduit::DataType::float64(num_elements);
}
+template<> conduit::DataType hdf5_reader<short>::get_conduit_data_type(conduit::index_t num_elements) const {
+ return conduit::DataType::int16(num_elements);
+}
+
+// TODO (oyamay): Instantiate hdf5_reader for large samples
+#define PROTO(T) template class hdf5_reader<T>;
+
+#include "lbann/macros/instantiate.hpp"
} // namespace lbann
diff --git a/src/io/data_buffers/partitioned_io_buffer.cpp b/src/io/data_buffers/partitioned_io_buffer.cpp
index 8609adf4558..5d51747f9ea 100644
--- a/src/io/data_buffers/partitioned_io_buffer.cpp
+++ b/src/io/data_buffers/partitioned_io_buffer.cpp
@@ -84,9 +84,11 @@ void partitioned_io_buffer::setup_data(El::Int num_neurons, El::
#ifdef LBANN_HAS_DISTCONV
if (dc::is_cosmoflow_parallel_io_enabled()) {
num_neurons /= dc::get_number_of_io_partitions();
+ // TODO: Make sure that TensorDataType is equivalent to the HDF5
+ // data reader's data type (float as default).
// TensorDataType is assumed to be 2-byte integer types such as
- // short or int16_t.
- assert_eq(sizeof(TensorDataType), sizeof(short));
+ // short or int16_t in an older version.
+ // assert_eq(sizeof(TensorDataType), sizeof(short));
max_mini_batch_size *= dc::get_number_of_io_partitions();
}
#endif // LBANN_HAS_DISTCONV
diff --git a/src/layers/data_type_layer.cpp b/src/layers/data_type_layer.cpp
index 0f34fc53a00..116bd4e8921 100644
--- a/src/layers/data_type_layer.cpp
+++ b/src/layers/data_type_layer.cpp
@@ -860,7 +860,7 @@ void data_type_layer::bp_setup_gradient_wrt_inputs(
#ifdef LBANN_HAS_DISTCONV
template
-void data_type_layer::setup_distconv_adapter() {
+void data_type_layer::setup_distconv_adapter(const DataReaderMetaData& dr_metadata) {
this->get_distconv_adapter_ptr() = make_unique>(*this);
}
diff --git a/src/layers/io/input/input_layer.cpp b/src/layers/io/input/input_layer.cpp
index f2e3fe3bda5..b1ffad6f4d5 100644
--- a/src/layers/io/input/input_layer.cpp
+++ b/src/layers/io/input/input_layer.cpp
@@ -27,9 +27,6 @@
#define LBANN_INPUT_LAYER_INSTANTIATE
#include "lbann/layers/io/input/input_layer.hpp"
#include "lbann/utils/profiling.hpp"
-#ifdef LBANN_HAS_DISTCONV
-#include "lbann/data_readers/data_reader_hdf5.hpp"
-#endif // LBANN_HAS_DISTCONV
namespace lbann {
@@ -37,25 +34,14 @@ namespace lbann {
template
input_distconv_adapter::
-input_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer),
- m_shuffle_required(true) {
+input_distconv_adapter(Layer& layer, const bool shuffle_required)
+ : data_type_distconv_adapter(layer),
+ m_shuffle_required(shuffle_required) {
// Input data is only processed when its consumer layer is also
// enabled for distconv
for (int i = 0; i < layer.get_num_children(); ++i) {
m_is_input_processed.push_back(layer.get_child_layers()[i]->distconv_enabled());
}
- auto &l = dynamic_cast&>(this->layer());
- // TODO: hdf5_reader is assumed to return a sub-sample partitioned
- // in the same way as specified by the parallel strategy of this input
- // layer. Other data readers are assumed to return a complete
- // sample, thus shuffling is required (unless sample-parallel
- // strategy is given). Conceptually, it seems to make sense if a
- // data reader is annotated with a parallel strategy. Note that,
- // when the HDF5 data reader is used, it is assumed that it is used
- // in all execution modes.
- auto training_dr = l.get_data_reader(execution_mode::training);
- m_shuffle_required = dynamic_cast(training_dr) == nullptr;
if (m_shuffle_required) {
m_shufflers.resize(layer.get_num_children());
}
@@ -277,8 +263,11 @@ void input_distconv_adapter::fp_comp
for (int mat_idx = 0; mat_idx < l.get_num_children(); ++mat_idx) {
if (!is_input_processed(mat_idx)) continue;
- assert_eq(mb_size * dc::get_number_of_io_partitions(),
- l.get_activations(mat_idx).Width());
+ // TODO: This is disabled as it raises an error when the HDF5 data
+ // reader with hyperslab labels is used. Remove this assertion or
+ // reshape the activation tensor (mat_idx=1).
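+ // The disabled check is kept below for reference: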
+ // assert_eq(mb_size * dc::get_number_of_io_partitions(), + // l.get_activations(mat_idx).Width()); auto &original_tensor = *m_original_host_tensors[mat_idx]; auto &host_tensor = *m_host_tensors[mat_idx]; diff --git a/src/layers/layer.cpp b/src/layers/layer.cpp index c7eb4351843..c409c461e85 100644 --- a/src/layers/layer.cpp +++ b/src/layers/layer.cpp @@ -401,7 +401,7 @@ void Layer::setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata) { setup_dims(dr_metadata); setup_matrices(m_comm->get_trainer_grid()); #ifdef LBANN_HAS_DISTCONV - prepare_distconv(); + prepare_distconv(dr_metadata); #endif // LBANN_HAS_DISTCONV setup_data(max_mini_batch_size); if (using_gpus()) { setup_gpu(); } @@ -646,9 +646,9 @@ void Layer::set_layer_pointers(std::vector layers) { } #ifdef LBANN_HAS_DISTCONV -void Layer::prepare_distconv() { +void Layer::prepare_distconv(const DataReaderMetaData& dr_metadata) { if (distconv_enabled()) { - setup_distconv_adapter(); + setup_distconv_adapter(dr_metadata); } } diff --git a/src/layers/learning/base_convolution.cpp b/src/layers/learning/base_convolution.cpp index c9806f2fcc8..3faa3f5817d 100644 --- a/src/layers/learning/base_convolution.cpp +++ b/src/layers/learning/base_convolution.cpp @@ -1215,7 +1215,8 @@ base_convolution_layer::get_backward_filter_algo_cudnn( #ifdef LBANN_HAS_DISTCONV template -void base_convolution_layer::setup_distconv_adapter() { +void base_convolution_layer::setup_distconv_adapter( + const DataReaderMetaData& dr_metadata) { this->get_distconv_adapter_ptr() = make_unique< base_convolution_adapter>(*this); } @@ -1254,15 +1255,11 @@ void base_convolution_adapter::setup_fp_tensors() { std::reverse(kernel_shape.begin(), kernel_shape.end()); const dc::LocaleMPI loc(dc::get_mpi_comm(), false); m_kernel = make_unique(kernel_shape, loc, shared_dist); - assert0(dc::tensor::View( - *m_kernel, layer.weights_values(0).LockedBuffer())); if (layer.m_bias_scaling_factor != TensorDataType(0)) { dc::Shape bias_shape(dc::get_num_dims(layer), 1); bias_shape[dc::get_channel_dim()] = layer.get_output_dims()[0]; m_bias = make_unique(bias_shape, loc, shared_dist); - assert0(dc::tensor::View( - *m_bias, layer.weights_values(1).LockedBuffer())); } } diff --git a/src/layers/learning/convolution.cpp b/src/layers/learning/convolution.cpp index e9ea59a2b58..ea7723cafbc 100644 --- a/src/layers/learning/convolution.cpp +++ b/src/layers/learning/convolution.cpp @@ -162,7 +162,8 @@ void convolution_layer::bp_compute() { #if defined LBANN_HAS_DISTCONV template -void convolution_layer::setup_distconv_adapter() { +void convolution_layer::setup_distconv_adapter( + const DataReaderMetaData& dr_metadata) { this->get_distconv_adapter_ptr() = make_unique< convolution_distconv_adapter>(*this); } diff --git a/src/layers/learning/deconvolution.cpp b/src/layers/learning/deconvolution.cpp index 3f80353e369..79c1e3e1e58 100644 --- a/src/layers/learning/deconvolution.cpp +++ b/src/layers/learning/deconvolution.cpp @@ -185,7 +185,7 @@ void deconvolution_layer::bp_compute() { #if defined LBANN_HAS_DISTCONV template void deconvolution_layer -::setup_distconv_adapter() { +::setup_distconv_adapter(const DataReaderMetaData& dr_metadata) { this->get_distconv_adapter_ptr() = make_unique< deconvolution_distconv_adapter>(*this); } diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 6a1ef565820..d7c1ad3085d 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -252,7 +252,6 @@ class factory_manager 
LBANN_REGISTER_DEFAULT_BUILDER(BooleanFalseNegative, boolean_false_negative); LBANN_REGISTER_DEFAULT_BUILDER(BooleanFalsePositive, boolean_false_positive); LBANN_REGISTER_DEFAULT_BUILDER(CategoricalAccuracy, categorical_accuracy); - LBANN_REGISTER_DEFAULT_BUILDER(CrossEntropy, cross_entropy); LBANN_REGISTER_DEFAULT_BUILDER(L1Norm, l1_norm); LBANN_REGISTER_DEFAULT_BUILDER(L2Norm2, l2_norm2); LBANN_REGISTER_DEFAULT_BUILDER(MeanAbsoluteError, mean_absolute_error); @@ -330,6 +329,7 @@ std::unique_ptr construct_layer_legacy( if (mode_str.empty() || mode_str == "classification") { target_mode = data_reader_target_mode::CLASSIFICATION; } if (mode_str == "regression") { target_mode = data_reader_target_mode::REGRESSION; } if (mode_str == "reconstruction") { target_mode = data_reader_target_mode::RECONSTRUCTION; } + if (mode_str == "label_reconstruction") { target_mode = data_reader_target_mode::LABEL_RECONSTRUCTION; } if (mode_str == "na" || mode_str == "NA" || mode_str == "N/A") { target_mode = data_reader_target_mode::NA; } if (Layout != data_layout::DATA_PARALLEL) { LBANN_ERROR("input layer is only supported with " @@ -657,6 +657,10 @@ std::unique_ptr construct_layer_legacy( } // Loss layers + if (proto_layer.has_cross_entropy()) { + const auto& params = proto_layer.cross_entropy(); + return lbann::make_unique>(comm, params.use_labels()); + } if (proto_layer.has_top_k_categorical_accuracy()) { const auto& params = proto_layer.top_k_categorical_accuracy(); return lbann::make_unique>(comm, params.k()); diff --git a/src/proto/layers.proto b/src/proto/layers.proto index 9ccb656499a..8ec39f1b446 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -316,7 +316,9 @@ message Layer { /////////////////////// // Loss layers // /////////////////////// - message CrossEntropy {} + message CrossEntropy { + bool use_labels = 1; //default: false + } message MeanSquaredError {} message MeanAbsoluteError {} message CategoricalAccuracy {} @@ -397,7 +399,7 @@ message Layer { ////////////////// message Input { string io_buffer = 2; // Options: "partitioned" (default) - string target_mode = 3; // Options: "classification" (default), "regression", "reconstruction", "N/A" + string target_mode = 3; // Options: "classification" (default), "regression", "reconstruction", "label_reconstruction", "N/A" } ////////////////////// diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index d3e3b128a80..ed3196e4c2e 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -187,15 +187,28 @@ void init_data_readers( reader_numpy_npz->set_scaling_factor_int16(readme.scaling_factor_int16()); reader = reader_numpy_npz; #ifdef LBANN_HAS_DISTCONV - } else if (name=="cosmoflow_hdf5") { - auto* reader_cosmo_hdf5 = new hdf5_reader(shuffle); + } else if (name == "cosmoflow_hdf5" || name == "hdf5") { + if(name == "cosmoflow_hdf5") { + LBANN_WARNING("The \"cosmoflow_hdf5\" data reader is deprecated. 
Use \"hdf5\" instead."); + } + const auto key_data = readme.hdf5_key_data(); + const auto key_labels = readme.hdf5_key_labels(); + const auto key_responses = readme.hdf5_key_responses(); + const auto hyperslab_labels = readme.hdf5_hyperslab_labels(); + auto* reader_hdf5 = new hdf5_reader(shuffle, key_data, + key_labels, + key_responses, + hyperslab_labels); + reader_hdf5->set_has_labels(!readme.disable_labels()); + reader_hdf5->set_has_responses(!readme.disable_responses()); + reader_hdf5->set_num_responses(readme.num_responses()); auto filedir = readme.data_filedir(); if(!endsWith(filedir, "/")) { filedir = filedir + "/"; } const auto paths = glob(filedir +readme.data_file_pattern()); - reader_cosmo_hdf5->set_hdf5_paths(paths); - reader = reader_cosmo_hdf5; + reader_hdf5->set_hdf5_paths(paths); + reader = reader_hdf5; #endif // LBANN_HAS_DISTCONV } else if (name == "pilot2_molecular_reader") { pilot2_molecular_reader* reader_pilot2_molecular = new pilot2_molecular_reader(readme.num_neighbors(), readme.max_neighborhood(), shuffle); diff --git a/src/proto/reader.proto b/src/proto/reader.proto index 5f0fec50ba7..ce01e29dd84 100644 --- a/src/proto/reader.proto +++ b/src/proto/reader.proto @@ -94,6 +94,15 @@ message Reader { PythonDataReader python = 501; repeated Transform transforms = 600; // Ordered list of transforms to apply. + + //------------- start of only for HDF5 data reader ------------------ + string hdf5_key_data = 700; + string hdf5_key_labels = 701; + string hdf5_key_responses = 702; + bool hdf5_hyperslab_labels = 703; + int32 num_responses = 704; + //------------- end of only for HDF5 data reader ------------------ + } message PythonDataReader { From 918c2bb364ae1a6073990b846ec6442a0f0d84f2 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 23 Sep 2020 18:25:32 -0700 Subject: [PATCH 33/36] Updated docs, added quick start guide (#1625) * added note about updating versions and information on LLNL pre-installed libraries * updated developer install notes with missing step * Added quickstart and updated index listing * fixed error in dev install from previous commit * Added info about viewing spack lbann options Co-authored-by: Michael Wyatt --- docs/build_llnl_idiosyncracies.rst | 2 +- docs/building_lbann.rst | 5 +- docs/index.rst | 1 + docs/quick_start.rst | 368 +++++++++++++++++++++++++++++ 4 files changed, 373 insertions(+), 3 deletions(-) create mode 100644 docs/quick_start.rst diff --git a/docs/build_llnl_idiosyncracies.rst b/docs/build_llnl_idiosyncracies.rst index f5f3836ef0f..c94b4d7792d 100644 --- a/docs/build_llnl_idiosyncracies.rst +++ b/docs/build_llnl_idiosyncracies.rst @@ -24,7 +24,7 @@ this guide. Pre-installed Binary Packages ------------------------------ - +.. comment:: need to update this section with newer versions or perhaps remove if this method is no longer used by developers The LC machines have many instances of cuDNN and NCCL installed in locations shared by the :code:`brain` group. These may be consistently detected by CMake by :code:`export`-ing their locations into the diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst index c3053300fd4..7a29790af17 100644 --- a/docs/building_lbann.rst +++ b/docs/building_lbann.rst @@ -137,8 +137,9 @@ CMake flags known to LBANN's "Superbuild" build system. :code:`-e` flag. A full list of options can be viewed with the :code:`-h` flag. -2. Setup the LBANN CMake environment using the Spack environment for - the dependencies. +2. 
Setup the LBANN CMake environment using the Spack environment for the + dependencies. If you used a custom Spack environment name in the step + above, be sure to specify that with the :code:`-e` option: .. code-block:: bash diff --git a/docs/index.rst b/docs/index.rst index bb56b6ccf56..40536b6f2a6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,6 +29,7 @@ Users are advised to view `the Doxygen API Documentation :maxdepth: 2 :caption: Getting Started + quick_start building_lbann running_lbann callbacks diff --git a/docs/quick_start.rst b/docs/quick_start.rst new file mode 100644 index 00000000000..26a0693470b --- /dev/null +++ b/docs/quick_start.rst @@ -0,0 +1,368 @@ +.. role:: bash(code) + :language: bash + +==================== +Quick Start +==================== + +-------------------- +What can LBANN do? +-------------------- + +The Livermore Big Artificial Neural Network toolkit (LBANN) is an HPC-centric +deep learning training framework that works across multiple levels of +parallelism. LBANN is capable of taking advantage of HPC hardware to +accelerate the training of deep learning models on massive datasets. + + +-------------------- +Installing LBANN +-------------------- + +LBANN supports installation through Spack and CMake. We recommend using the +Spack installation instructions below. If the Spack install fails, try using +the :ref:`CMake install `. + +1. Download and install `Spack `_. Enable the + additional Spack commands for module files described `here + `_: + + .. code-block:: bash + + source ${SPACK_ROOT}/share/spack/setup-env.sh + +2. Users that are `familiar with Spack + `_ + and already have a `custom Spack ecosystem + `_ can install + LBANN with: + + .. code-block:: bash + + spack install lbann + + A complete list of LBANN install options can be found with: + + .. code-block:: bash + + spack info lbann + + For users new to Spack, LBANN provides a script that will perform some + basic configuration (e.g., add paths to externally installed packages) and + install LBANN in a Spack environment. *This script is only tested and + maintained for systems at LLNL, NERSC, and ORNL. If you are not running on + a system at one of these institutions, you may try the Spack install above + or the :ref:`CMake install `.* To use this installation + script, clone the repository and run the script: + + .. code-block:: bash + + git clone https://github.com/llnl/lbann + cd ./lbann + ./scripts/install_lbann.sh -e lbann + + View other options available by passing the :code:`-h` option to the + script. + +.. note:: It is recommended that your Spack environment take advantage + of locally installed tools. Unless your Spack environment + is explicitly told about tools such as CMake, Python, MPI, + etc., it will install everything that LBANN and all of its + dependencies require. This can take quite a long time but + only has to be done once for a given spack repository. Once + all of the standard tools are installed, rebuilding LBANN + with Spack is quite fast. + + Advice on setting up paths to external installations is + beyond the scope of this document but is covered in the + `Spack Documentation + `_. + + +-------------------- +Test LBANN Install +-------------------- + +1. If you used the :code:`install_lbann.sh` script for installation or + installed in a Spack environment, you will need to activate the Spack LBANN + environment: + + .. code-block:: bash + + spack env activate -p lbann + +2. 
Test an implementation of the `LeNet neural network + `_ on the `MNIST data set + `_ at :code:`/applications/vision/lenet.py` to verify that your LBANN installation + is working correctly: + + .. code-block:: bash + + cd /applications/vision/ + python3 lenet.py + + Running this Python script will automatically submit a job to the system + scheduler. If LBANN was built successfully, you should see output from + LBANN about loading the data, building the network, and training the model. + + If LBANN fails to run, you can view the generated job script and log files, + and run the job manually: + + .. code-block:: bash + + ls ./*_lbann_lenet + + If this also fails, you may try building LBANN again using the :ref:`CMake + install instructions `. + + +-------------------- +Basic Usage +-------------------- + +A typical workflow involves the following steps: + +1. Configuring a :python:`Trainer`. + +2. Configuring LBANN model components (like the graph of + :python:`Layer`\ s) and creating a :python:`Model`. + + + Classes for model components are automatically generated from the + LBANN Protobuf specifications in `lbann/src/proto + `_. These + files are currently the best source of documentation. Message + fields in the Protobuf specification are optional keyword + arguments for the corresponding Python class constructor. If a + keyword argument is not provided, it is logically zero (e.g., false + for Boolean fields and empty for string fields). + +3. Configuring the default :python:`Optimizer` to be used by the + :python:`Weights` objects. + +4. Loading in a Protobuf text file describing the data reader. + + + The Python frontend currently does not have good support for + specifying data readers. If any data reader properties need to be + set programmatically, the user must do it directly via the + Protobuf Python API. + +5. Launching LBANN by calling :python:`run`. + + + :python:`lbann.run` should be run from a compute node. If a node + allocation is not available, the :python:`batch_job` option can + be set to submit a batch job to the scheduler. + + + A timestamped work directory will be created each time LBANN is + run. The default location of these work directories can be set + with the environment variable :bash:`LBANN_EXPERIMENT_DIR`. + + + Supported job managers are Slurm and LSF. + + + LLNL users and collaborators may prefer to use + :python:`lbann.contrib.launcher.run`. This is similar to + :python:`lbann.run`, with defaults and optimizations for certain + systems.
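Putting the five steps together, a minimal sketch of a complete script is shown below. This is an outline rather than a tested recipe: the layer graph is a stand-in, and :python:`make_data_reader` is the helper defined in the PyTorch-to-LBANN guide that follows.

.. code-block:: python

   import lbann

   # (1) Trainer
   trainer = lbann.Trainer(mini_batch_size=64)

   # (2) Model: a stand-in layer graph plus an objective function
   input_ = lbann.Input()
   x = lbann.FullyConnected(lbann.Identity(input_), num_neurons=10, has_bias=True)
   probs = lbann.Softmax(x)
   loss = lbann.CrossEntropy(probs, lbann.Identity(input_))
   model = lbann.Model(epochs=5,
                       layers=lbann.traverse_layer_graph(input_),
                       objective_function=loss)

   # (3) Default optimizer, used to construct the Weights objects
   opt = lbann.SGD(learn_rate=0.01, momentum=0.5)

   # (4) Data reader loaded from a Protobuf text file (helper shown below)
   data_reader = make_data_reader('./mnist_data')

   # (5) Launch from a compute node (or pass batch_job=True)
   lbann.run(trainer, model, data_reader, opt)

+ + +-------------------- +PyTorch to LBANN +-------------------- + +The LBANN Python API is very similar to the PyTorch API. In order to help +users familiar with PyTorch transition to LBANN, we have prepared the following +guide: + +~~~~~~~~~~~~~~~~~~~~ +Loading Data +~~~~~~~~~~~~~~~~~~~~ +Both LBANN and PyTorch use similar strategies for loading data into models. +With PyTorch, we can load the `MNIST dataset +`_ using the included +:python:`DataLoader`: + + .. code-block:: python + + import torch + from torchvision import datasets, transforms + + batch_size = 64 + data_loader = torch.utils.data.DataLoader( + datasets.MNIST('data', train=True, download=True, + transform=transforms.ToTensor()), + batch_size=batch_size) + +With LBANN, you can write custom data reader functions that use protobuf files +to define the input data and transform it into the input tensors for your +model: + + .. 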
code-block:: python + + import os + import lbann + from google.protobuf import text_format + + def make_data_reader(data_dir): + protobuf_file = os.path.join(data_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + text_format.Merge(f.read(), message) + message = message.data_reader + message.reader[0].data_filedir = data_dir + + return message + + data_reader = make_data_reader(os.path.realpath('./mnist_data/')) + +This reader assumes that the files `train-images-idx3-ubyte +`_, +`train-labels-idx1-ubyte +`_, and +:code:`data_reader.prototext` are located in the :bash:`./mnist_data` +directory. The :code:`data_reader.prototext` file contains the following: + + .. code-block:: protobuf + + data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "mnist_data" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } + }
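If reader properties need to be adjusted programmatically (as mentioned in the Basic Usage section), the loaded message can be modified through the Protobuf Python API before it is handed to LBANN. A small sketch, reusing the field names from the prototext above (the new values are arbitrary examples):

.. code-block:: python

   # Tweak reader fields on the message returned by make_data_reader();
   # the field names match the prototext above, the values are examples.
   data_reader.reader[0].shuffle = False
   data_reader.reader[0].validation_percent = 0.2

+ +~~~~~~~~~~~~~~~~~~~~ +Building a Model +~~~~~~~~~~~~~~~~~~~~ + +Building models in LBANN is similar to building models in PyTorch. +For example, we can define a simple PyTorch model for the MNIST dataset with: + + .. code-block:: python + + import torch.nn as nn + import torch.nn.functional as F + + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv = nn.Conv2d(1, 20, kernel_size=5) + self.fc = nn.Linear(12*12*20, 10) + + def forward(self, x): + x = self.conv(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = x.view(x.size(0), -1) + x = self.fc(x) + x = F.log_softmax(x, dim=1) + return x + + net = Net() + + +Using LBANN, that same neural network can be built with: + + .. code-block:: python + + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + + x = lbann.Convolution(images, num_dims=2, num_output_channels=20, + num_groups=1, conv_dims_i=5, conv_strides_i=1, + conv_dilations_i=1, has_bias=True) + x = lbann.Relu(x) + x = lbann.Pooling(x, num_dims=2, pool_dims_i=2, + pool_strides_i=2, pool_mode='max') + x = lbann.FullyConnected(x, num_neurons=10, has_bias=True) + probs = lbann.Softmax(x) + + loss = lbann.CrossEntropy(probs, labels) + + model = lbann.Model(epochs=5, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + callbacks=[lbann.CallbackPrintModelDescription(), + lbann.CallbackPrint()]) + +~~~~~~~~~~~~~~~~~~~~ +Setup Model Training +~~~~~~~~~~~~~~~~~~~~ + +Training a model with PyTorch can be achieved by setting a few parameters, +defining an optimizer, and building a training loop: + + .. code-block:: python + + import torch.optim as optim + + learning_rate = 0.01 + momentum = 0.5 + + opt = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum) + + def train(epoch): + net.train() + for batch_idx, (data, target) in enumerate(data_loader): + opt.zero_grad() + output = net(data) + loss = F.nll_loss(output, target) + loss.backward() + opt.step() + + print('Training Epoch: {},\tLoss: {:.3f}'.format(epoch, loss.item())) + +With LBANN, we also define learning parameters and an optimizer. However, +LBANN provides a :python:`Trainer` that removes the need to build your own +training loop: + + .. 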
code-block:: python + + learning_rate = 0.01 + momentum = 0.5 + batch_size = 64 + + opt = lbann.SGD(learn_rate=learning_rate, momentum=momentum) + + trainer = lbann.Trainer(mini_batch_size=batch_size) + +~~~~~~~~~~~~~~~~~~~~ +Run the Experiment +~~~~~~~~~~~~~~~~~~~~ + +Running the experiment in PyTorch is as simple as calling the training loop: + + .. code-block:: python + + for epoch in range(5): + train(epoch) + +Running the experiment in LBANN is just as easy: + + .. code-block:: python + + import lbann.contrib.launcher + lbann.contrib.launcher.run(trainer, model, data_reader, + opt, job_name='mnist-test') + +Python acts only as a frontend for LBANN. The above commands will +automatically generate a batch job script and submit it to the system +scheduler. You can see the job script and associated job files in the +:bash:`./*mnist-test/` directory. + +.. note:: The LBANN :python:`launcher.run` can accept additional arguments to + specify scheduler and job parameters. LBANN provides + methods that help with these parameters at + :python:`lbann.contrib.args.add_scheduler_arguments()` and + :python:`lbann.contrib.args.get_scheduler_kwargs()`.
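For example, a driver script might expose those scheduler options on its own command line and forward them to the launcher. The sketch below assumes only the two helper functions named in the note; the exact flags they add may vary between LBANN versions:

.. code-block:: python

   import argparse
   import lbann.contrib.args
   import lbann.contrib.launcher

   # Let the LBANN helpers register scheduler-related command-line options
   parser = argparse.ArgumentParser()
   lbann.contrib.args.add_scheduler_arguments(parser)
   args = parser.parse_args()

   # Convert the parsed options into keyword arguments for the launcher
   kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
   lbann.contrib.launcher.run(trainer, model, data_reader, opt,
                              job_name='mnist-test', **kwargs)

From 3927736de120546489fbf57090ca649323722e3d Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Wed, 23 Sep 2020 18:30:05 -0700 Subject: [PATCH 34/36] Integrated fixes for Ray's build environment from szaman19 (#1639) --- .../llnl_lc/externals-linux-rhel7-power8le.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spack_environments/llnl_lc/externals-linux-rhel7-power8le.sh b/spack_environments/llnl_lc/externals-linux-rhel7-power8le.sh index 0f8e9bd6301..58488475230 100644 --- a/spack_environments/llnl_lc/externals-linux-rhel7-power8le.sh +++ b/spack_environments/llnl_lc/externals-linux-rhel7-power8le.sh @@ -8,7 +8,7 @@ EXTERNAL_ALL_PACKAGES=$(cat < Date: Tue, 29 Sep 2020 15:48:52 -0700 Subject: [PATCH 35/36] Updated release notes for upcoming release v0.101. (#1640) * Updated release notes for upcoming release v0.101. 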
* Fixed typo --- ReleaseNotes.txt | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index 1ebd8e4a2b8..cea20e3b687 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -21,6 +21,75 @@ Bug fixes: Retired features: +============================== (Pending) Release Notes: v0.101 ============================== + +Support for new training algorithms: + +Support for new network structures: + - ATOM VAE model + - Graph neural networks + - Graph Convolutional Networks (GCN) + - 3D U-Net Model + +Support for new layers: + - Implemented optimized GRU layer using cuDNN kernel + - Graph Layers: GCN, GIN, Graph, GatedGraph + +Python front-end: + - Support for Graph and Graph Convolutional Networks + - Added support for OCLF data center (Summit) + +Performance optimizations: + - Optimize CUDA kernel for tensor reordering in GRU layer + - Enabled TensorCore optimization for GRU layer + - GCN and Graph layers also have a faster Dense variant which only utilizes Matrix Multiplication + +Model portability & usability: + - Added Users Quickstart section to documentation including PyTorch + to LBANN mini-tutorial + - Added section on callbacks with detailed instructions on summarize + images callback + +Internal features: + - Support for double data type in distributed embedding layer + - Support for large number of channels in GPU batchnorm layer + - Modified LTFB so that NaNs lose tournaments + - Improved numerical stability of reconstruction loss in ATOM VAE + model + - Skip bad gradients in Adam + +I/O & data readers: + - Added support for ImageNet data reader to use sample lists + - Refactored sample list code to be more flexible and generalize + beyond JAG data reader + - Added support for slab-based I/O in HDF5 data reader required by + DistConv implementations of CosmoFlow 3D volumes + - Extended slab-based HDF5 data reader to support labels and + reconstruction modes for use with U-Net architecture + +Datasets: + - Added two graph datasets (MNIST, and PROTEINS) + +Build system and Dependent Libraries: + - Hydrogen 1.4.0 + - Aluminum 0.4.0 + - Spack v0.15.4+ (Requires new format for environments) + - cuDNN 8.0.2 + - Require C++14 + - Added Spack build support for OCLF data center (Summit) + +Bug fixes: + - Properly reset data coordinator after each LTFB round + - Fixed bug in weights proxy when weights buffer is reallocated + - Bugfix for smiles data reader bound checking and simple LTFB data + distribution + - Eliminated a race condition observed in VAE ATOM model with SMILES + data reader. Added a barrier after each data store mini-batch + exchange -- avoid race between non-blocking sends and receives and + later GPU kernel communication. + +Retired features: + ============================== Release Notes: v0.100 ============================== Support for new network structures: - 3D molecular generation models for Metal Organic Frameworks from the CoRE MOF Database. From 6a0f8bfc44b8bfa0dad4f05c09598ab00c24f86b Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Tue, 29 Sep 2020 15:59:02 -0700 Subject: [PATCH 36/36] Release branch for v0.101 ============================== Release Notes: v0.101 ============================== Support for new training algorithms: Support for new network structures: - ATOM VAE model - Graph neural networks - Graph Convolutional Networks (GCN) - 3D U-Net Model Support for new layers: - Implemented optimized GRU layer using cuDNN kernel - Graph Layers: GCN, GIN, Graph, GatedGraph Python front-end: - Support for Graph and Graph Convolutional Networks - Added support for OCLF data center (Summit) Performance optimizations: - Optimize CUDA kernel for tensor reordering in GRU layer - Enabled TensorCore optimization for GRU layer - GCN and Graph layers also have a faster Dense variant which only utilizes Matrix Multiplication Model portability & usability: - Added Users Quickstart section to documentation including PyTorch to LBANN mini-tutorial - Added section on callbacks with detailed instructions on summarize images callback Internal features: - Support for double data type in distributed embedding layer - Support for large number of channels in GPU batchnorm layer - Modified LTFB so that NaNs lose tournaments - Improved numerical stability of reconstruction loss in ATOM VAE model - Skip bad gradients in Adam I/O & data readers: - Added support for ImageNet data reader to use sample lists - Refactored sample list code to be more flexible and generalize beyond JAG data reader - Added support for slab-based I/O in HDF5 data reader required by DistConv implementations of CosmoFlow 3D volumes - Extended slab-based HDF5 data reader to support labels and reconstruction modes for use with U-Net architecture Datasets: - Added two graph datasets (MNIST, and PROTEINS) Build system and Dependent Libraries: - Hydrogen 1.4.0 - Aluminum 0.4.0 - Spack v0.15.4+ (Requires new format for environments) - cuDNN 8.0.2 - Require C++14 - Added Spack build support for OCLF data center (Summit) Bug fixes: - Properly reset data coordinator after each LTFB round - Fixed bug in weights proxy when weights buffer is reallocated - Bugfix for smiles data reader bound checking and simple LTFB data distribution - Eliminated a race condition observed in VAE ATOM model with SMILES data reader. Added a barrier after each data store mini-batch exchange -- avoid race between non-blocking sends and receives and later GPU kernel communication. Retired features: --- CMakeLists.txt | 2 +- ReleaseNotes.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ed8c4b5250..0dd34bc2cef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ endif () # set(LBANN_VERSION_MAJOR 0) -set(LBANN_VERSION_MINOR 100) +set(LBANN_VERSION_MINOR 101) set(LBANN_VERSION_PATCH 0) set(LBANN_VERSION "${LBANN_VERSION_MAJOR}.${LBANN_VERSION_MINOR}.${LBANN_VERSION_PATCH}") diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index cea20e3b687..b5b4f96b4d6 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -21,7 +21,7 @@ Bug fixes: Retired features: -============================== (Pending) Release Notes: v0.101 ============================== +============================== Release Notes: v0.101 ============================== Support for new training algorithms: