diff --git a/CMakeLists.txt b/CMakeLists.txt
index 807adfb27da..0dd34bc2cef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,9 +26,9 @@ if (NOT DEFINED BUILD_SHARED_LIBS)
   set(BUILD_SHARED_LIBS ON)
 endif ()
 
-# Build with at least C++11 standard; allow newer standards.
+# Build with at least C++14 standard; allow newer standards.
 if (NOT CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD EQUAL 98)
-  set(CMAKE_CXX_STANDARD 11)
+  set(CMAKE_CXX_STANDARD 14)
   set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
 endif ()
 
@@ -48,7 +48,7 @@ endif ()
 #
 
 set(LBANN_VERSION_MAJOR 0)
-set(LBANN_VERSION_MINOR 100)
+set(LBANN_VERSION_MINOR 101)
 set(LBANN_VERSION_PATCH 0)
 
 set(LBANN_VERSION "${LBANN_VERSION_MAJOR}.${LBANN_VERSION_MINOR}.${LBANN_VERSION_PATCH}")
@@ -188,16 +188,20 @@ set(LBANN_HAS_CEREAL ${CEREAL_FOUND})
 # The imported target is just called "cereal". Super.
 
 # Setup the linear algebra library
-find_package(Hydrogen 1.3.3 NO_MODULE QUIET
+find_package(Hydrogen 1.4.0 NO_MODULE QUIET
   HINTS ${Hydrogen_DIR} ${HYDROGEN_DIR} $ENV{Hydrogen_DIR} $ENV{HYDROGEN_DIR}
   PATH_SUFFIXES lib/cmake/hydrogen
   NO_DEFAULT_PATH)
 if (NOT Hydrogen_FOUND)
-  find_package(Hydrogen 1.3.3 NO_MODULE QUIET REQUIRED)
+  find_package(Hydrogen 1.4.0 NO_MODULE QUIET REQUIRED)
 endif ()
 message(STATUS "Found Hydrogen: ${Hydrogen_DIR}")
 set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND})
 
+if (_HYDROGEN_HAVE_ROCM)
+  message(FATAL_ERROR "ROCm not yet supported in LBANN.")
+endif ()
+
 # DiHydrogen and Distconv
 if (LBANN_WITH_DISTCONV AND NOT LBANN_WITH_DIHYDROGEN)
   message(FATAL_ERROR "Distconv requires DiHydrogen. Enable DiHydrogen to use Distconv.")
@@ -260,7 +264,7 @@ if (LBANN_HAS_CUDA)
   enable_language(CUDA)
 
   if (NOT CMAKE_CUDA_STANDARD OR CMAKE_CUDA_STANDARD EQUAL 98)
-    set(CMAKE_CUDA_STANDARD 11)
+    set(CMAKE_CUDA_STANDARD 14)
   endif ()
   set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
 
@@ -271,13 +275,13 @@ if (LBANN_WITH_ALUMINUM)
 
   if (NOT Aluminum_FOUND)
     message(WARNING "Using Aluminum without Hydrogen support may not be well-supported.")
-    find_package(Aluminum 0.3.0 NO_MODULE QUIET
+    find_package(Aluminum 0.4.0 NO_MODULE QUIET
       HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR}
       PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum
       NO_DEFAULT_PATH)
     if (NOT Aluminum_FOUND)
-      find_package(Aluminum 0.3.0 NO_MODULE QUIET)
+      find_package(Aluminum 0.4.0 NO_MODULE QUIET)
     endif ()
   endif ()
   set(LBANN_HAS_ALUMINUM ${Aluminum_FOUND})
diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt
index 1ebd8e4a2b8..b5b4f96b4d6 100644
--- a/ReleaseNotes.txt
+++ b/ReleaseNotes.txt
@@ -21,6 +21,75 @@ Bug fixes:
 
 Retired features:
 
+============================== Release Notes: v0.101 ==============================
+
+Support for new training algorithms:
+
+Support for new network structures:
+ - ATOM VAE model
+ - Graph neural networks
+ - Graph Convolutional Networks (GCN)
+ - 3D U-Net Model
+
+Support for new layers:
+ - Implemented optimized GRU layer using cuDNN kernel
+ - Graph Layers: GCN, GIN, Graph, GatedGraph
+
+Python front-end:
+ - Support for Graph and Graph Convolutional Networks
+ - Added support for OLCF data center (Summit)
+
+Performance optimizations:
+ - Optimize CUDA kernel for tensor reordering in GRU layer
+ - Enabled TensorCore optimization for GRU layer
+ - GCN and Graph layers also have a faster Dense variant which only
+   uses matrix multiplication
+
+Model portability & usability:
+ - Added Users Quickstart section to documentation, including a PyTorch
+   to LBANN mini-tutorial
+ - Added section on callbacks, with detailed instructions on the
+   summarize images callback
+
+Internal features:
+ - Support for double data type in distributed embedding layer
+ - Support for large number of channels in GPU batchnorm layer
+ - Modified LTFB so that NaNs lose tournaments
+ - Improved numerical stability of reconstruction loss in ATOM VAE
+   model
+ - Skip bad gradients in Adam
+
+I/O & data readers:
+ - Added support for ImageNet data reader to use sample lists
+ - Refactored sample list code to be more flexible and generalize
+   beyond JAG data reader
+ - Added support for slab-based I/O in HDF5 data reader required by
+   DistConv implementations of CosmoFlow 3D volumes
+ - Extended slab-based HDF5 data reader to support labels and
+   reconstruction modes for use with U-Net architecture
+
+Datasets:
+ - Added two graph datasets (MNIST and PROTEINS)
+
+Build system and dependent libraries:
+ - Hydrogen 1.4.0
+ - Aluminum 0.4.0
+ - Spack v0.15.4+ (requires new format for environments)
+ - cuDNN 8.0.2
+ - Require C++14
+ - Added Spack build support for OLCF data center (Summit)
+
+Bug fixes:
+ - Properly reset data coordinator after each LTFB round
+ - Fixed bug in weights proxy when weights buffer is reallocated
+ - Bugfix for SMILES data reader bounds checking and simple LTFB data
+   distribution
+ - Eliminated a race condition observed in VAE ATOM model with SMILES
+   data reader. Added a barrier after each data store mini-batch
+   exchange to avoid a race between non-blocking sends and receives and
+   later GPU kernel communication.
+
+Retired features:
+
 ============================== Release Notes: v0.100 ==============================
 
 Support for new network structures:
   - 3D molecular generation models for Metal Organic Frameworks from the CoRE MOF Database.
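The release notes above mention the new cuDNN-backed GRU layer and its Python front-end support. As a rough illustration only — a minimal sketch assuming `lbann.GRU`, `lbann.Input`, and `lbann.Constant` behave as they are used in the `vae.py` diff below — stacking GRU layers currently means one `lbann.GRU` call per layer:

```python
import lbann

# Hypothetical 2-layer stacked GRU, mirroring the GRUModule pattern in
# applications/ATOM/models/vae.py below (one lbann.GRU call per layer).
x = lbann.Identity(lbann.Input(name='seq'))
h0 = lbann.Constant(value=0, num_neurons='256', name='h0')
for i in range(2):
    x = lbann.GRU(x, h0, hidden_size=256, name=f'gru_layer{i}')
```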
diff --git a/applications/ATOM/models/vae.py b/applications/ATOM/models/vae.py
new file mode 100644
index 00000000000..490f156f872
--- /dev/null
+++ b/applications/ATOM/models/vae.py
@@ -0,0 +1,368 @@
+import math
+import lbann
+import lbann.modules
+from lbann.util import make_iterable
+
+def str_list(l):
+    """Convert an iterable object to a space-separated string."""
+    return ' '.join(str(i) for i in make_iterable(l))
+
+class GRUModule(lbann.modules.Module):
+
+    global_count = 0  # Static counter, used for default names
+
+    def __init__(
+        self,
+        hidden_size,
+        num_layers=1,
+        weights=[],
+        name=None,
+        device=None,
+        datatype=None,
+        weights_datatype=None,
+    ):
+        GRUModule.global_count += 1
+        self.instance = 0
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.name = name if name else f'gru{GRUModule.global_count}'
+        self.device = device
+        self.datatype = datatype
+
+        # Construct weights if needed
+        self.weights = weights
+        if not self.weights:
+            scale = 1 / math.sqrt(self.hidden_size)
+            init = lbann.UniformInitializer(min=-scale, max=scale)
+            if weights_datatype is None:
+                weights_datatype = self.datatype
+            self.weights = []
+            for i in range(self.num_layers):
+                self.weights.extend(
+                    lbann.Weights(
+                        initializer=init,
+                        name=f'{self.name}_layer{i}_{weight_name}',
+                        datatype=weights_datatype,
+                    )
+                    for weight_name in ('ih_matrix', 'hh_matrix', 'ih_bias', 'hh_bias')
+                )
+        if self.weights and len(self.weights) != 4*self.num_layers:
+            raise ValueError(
+                f'expected {4*self.num_layers} weights, '
+                f'but received {len(self.weights)}'
+            )
+
+        # Default initial hidden state
+        self.zeros = lbann.Constant(
+            value=0,
+            num_neurons=str(hidden_size),
+            name=f'{self.name}_zeros',
+            device=self.device,
+            datatype=self.datatype,
+        )
+
+    def forward(self, x, h=None):
+        self.instance += 1
+        name = f'{self.name}_instance{self.instance}'
+
+        # Initial hidden state
+        if not h:
+            h = [self.zeros] * self.num_layers
+        if not isinstance(h, list) or len(h) != self.num_layers:
+            raise ValueError(
+                f'expected `h` to be a list with {self.num_layers} layers'
+            )
+
+        # Stacked GRU
+        ### @todo Replace with single GRU once LBANN supports stacked GRUs
+        for i in range(self.num_layers):
+            x = lbann.GRU(
+                x,
+                h[i],
+                hidden_size=self.hidden_size,
+                name=f'{name}_layer{i}',
+                weights=self.weights[4*i:4*(i+1)],
+                device=self.device,
+                datatype=self.datatype,
+            )
+        return x
+
+class MolVAE(lbann.modules.Module):
+    """Molecular VAE.
+
+    See:
+    https://github.com/samadejacobs/moses/tree/master/moses/vae
+
+    """
+
+    global_count = 0  # Static counter, used for default names
+
+    def __init__(self, input_feature_dims, dictionary_size, embedding_size, ignore_label, name=None):
+        """Initialize Molecular VAE.
+
+        Args:
+            input_feature_dims (int): analogous to sequence length.
+            dictionary_size (int): vocabulary size
+            embedding_size (int): embedding size
+            ignore_label (int): padding index
+            name (str, optional): Module name
+                (default: 'molvae_module').
+
+        """
+        MolVAE.global_count += 1
+        self.instance = 0
+        self.name = (name if name
+                     else 'molvae_module{0}'.format(MolVAE.global_count))
+
+        self.input_feature_dims = input_feature_dims
+        self.embedding_size = embedding_size
+        self.dictionary_size = dictionary_size
+        self.label_to_ignore = ignore_label
+        self.datatype = lbann.DataType.FLOAT
+        self.weights_datatype = lbann.DataType.FLOAT
+
+        fc = lbann.modules.FullyConnectedModule
+        gru = GRUModule
+
+        # Encoder
+        self.encoder_rnn = gru(
+            hidden_size=256,
+            name=self.name+'_encoder_rnn',
+            datatype=self.datatype,
+            weights_datatype=self.weights_datatype,
+        )
+        self.q_mu = fc(128, name=self.name+'_encoder_qmu')
+        self.q_logvar = fc(128, name=self.name+'_encoder_qlogvar')
+        for w in self.q_mu.weights + self.q_logvar.weights:
+            w.datatype = self.weights_datatype
+
+        # Decoder
+        self.decoder_rnn = gru(
+            hidden_size=512,
+            num_layers=3,
+            name=self.name+'_decoder_rnn',
+            datatype=self.datatype,
+            weights_datatype=self.weights_datatype,
+        )
+        self.decoder_lat = fc(512, name=self.name+'_decoder_lat')
+        self.decoder_fc = fc(self.dictionary_size, name=self.name+'_decoder_fc')
+        for w in self.decoder_lat.weights + self.decoder_fc.weights:
+            w.datatype = self.weights_datatype
+        self.decoder_fc.weights[0].initializer = lbann.NormalInitializer(
+            mean=0, standard_deviation=1/math.sqrt(512))
+
+        # Shared encoder/decoder weights
+        self.emb_weights = lbann.Weights(
+            initializer=lbann.NormalInitializer(mean=0, standard_deviation=1),
+            name='emb_matrix',
+            datatype=self.weights_datatype,
+        )
+
+    def forward(self, x):
+        """Do the VAE forward step.
+
+        :param x: list of tensors of longs, embed representation of input
+        :return: float, kl term component of loss
+        :return: float, recon component of loss
+        """
+
+        x = lbann.Slice(x, slice_points=str_list([0, self.input_feature_dims]))
+        x = lbann.Identity(x)
+        x_emb = lbann.Embedding(
+            x,
+            num_embeddings=self.dictionary_size,
+            embedding_dim=self.embedding_size,
+            name='emb',
+            weights=self.emb_weights,
+        )
+
+        # Encoder: x -> z, kl_loss
+        z, kl_loss = self.forward_encoder(x_emb)
+
+        # Decoder: x, z -> recon_loss
+        pred = self.forward_decoder(x_emb, z)
+        recon_loss = self.compute_loss(x, pred)
+
+        # Hack to remove blocking GPU allreduce in evaluation layer
+        kl_loss = lbann.Identity(kl_loss, device='CPU')
+        recon_loss = lbann.Identity(recon_loss, device='CPU')
+
+        return kl_loss, recon_loss
+
+    def forward_encoder(self, x_emb):
+        """Encoder step, emulating z ~ E(x) = q_E(z|x)
+
+        :param x_emb: (n_batch, len(x), d_z) of floats, embeddings for input sentence x
+        :return: (n_batch, d_z) of floats, sample of latent vector z
+        :return: float, kl term component of loss
+        """
+
+        # _, h = self.encoder_rnn(x, None)
+        h = self.encoder_rnn(x_emb, None)
+
+        h = lbann.Slice(
+            h,
+            slice_points=str_list([self.input_feature_dims-1,
+                                   self.input_feature_dims]),
+            axis=0,
+        )
+        h = lbann.Identity(h)
+
+        mu, logvar = self.q_mu(h), self.q_logvar(h)
+
+        # Set datatype of previous layers
+        # Note: Depth-first search from mu and logvar to x_emb
+        stack = [mu, logvar]
+        in_stack = {l : True for l in stack}
+        while stack:
+            l = stack.pop()
+            if type(l) not in (lbann.Slice, lbann.Reshape, lbann.Tessellate):
+                l.datatype = self.datatype
+            for parent in l.parents:
+                if parent not in in_stack and parent is not x_emb:
+                    stack.append(parent)
+                    in_stack[parent] = True
+
+        # eps = torch.randn_like(mu)
+        eps = lbann.Gaussian(mean=0, stdev=1, hint_layer=mu)
+
+        # z = mu + (logvar / 2).exp() * eps
+        z = lbann.Add([mu, (lbann.Multiply([lbann.Exp(lbann.WeightedSum(logvar,scaling_factors='0.5')),eps]))])
+
+        # kl_loss = 0.5 * (logvar.exp() + mu ** 2 - 1 - logvar).sum(1).mean()
+        kl_loss = lbann.Reduction(
+            lbann.WeightedSum(
+                lbann.Exp(logvar),
+                lbann.Square(mu),
+                self.constant(1, hint_layer=mu),
+                logvar,
+                scaling_factors='0.5 0.5 -0.5 -0.5',
+            ),
+            mode='sum',
+        )
+
+        return z, kl_loss
+
+    def forward_decoder(self, x_emb, z):
+        """Decoder step, emulating x ~ G(z)
+
+        :param x_emb: (n_batch, len(x), d_z) of floats, embeddings for input sentence x
+        :param z: (n_batch, d_z) of floats, latent vector z
+        :return: float, recon component of loss
+        :return: list of ints, reconstructed sentence
+        """
+
+        # z_0 = z.unsqueeze(1).repeat(1, x_emb.size(1), 1)
+        # x_input = torch.cat([x_emb, z_0], dim=-1)
+        z_0 = lbann.Tessellate(
+            lbann.Reshape(z, dims=str_list([1, 128])),
+            dims=str_list([self.input_feature_dims, 128]),
+        )
+        x_input = lbann.Concatenation(x_emb, z_0, axis=1)
+
+        h_0 = self.decoder_lat(z)
+
+        # output, _ = self.decoder_rnn(x_input, h_0)
+        output = self.decoder_rnn(x_input, [h_0, h_0, h_0])
+
+        # y = self.decoder_fc(output)
+        y = lbann.ChannelwiseFullyConnected(
+            output,
+            output_channel_dims=self.dictionary_size,
+            bias=True,
+            name=f'{self.decoder_fc.name}',
+            weights=self.decoder_fc.weights,
+        )
+
+        # Set datatype of layers
+        # Note: Depth-first search from y to x_emb and z
+        stack = [y]
+        in_stack = {l : True for l in stack}
+        while stack:
+            l = stack.pop()
+            if type(l) not in (lbann.Slice, lbann.Reshape, lbann.Tessellate):
+                l.datatype = self.datatype
+            for parent in l.parents:
+                if parent not in in_stack and parent not in (x_emb, z):
+                    stack.append(parent)
+                    in_stack[parent] = True
+
+        return y
+
+    def compute_loss(self, x, y):
+
+        # y[:, :-1]
+        y = lbann.Slice(
+            y,
+            axis=0,
+            slice_points=str_list([0, self.input_feature_dims-1]),
+        )
+        y = lbann.Identity(y)
+
+        # x[:, 1:]
+        x = lbann.Slice(
+            x,
+            slice_points=str_list([1, self.input_feature_dims]),
+        )
+        x = lbann.Identity(x)
+
+        # Convert indices in x to one-hot representation
+        # Note: Ignored indices result in zero vectors
+        ignore_mask = lbann.Equal(
+            x,
+            self.constant(self.label_to_ignore, hint_layer=x),
+        )
+        keep_mask = lbann.LogicalNot(ignore_mask)
+        length = lbann.Reduction(keep_mask, mode='sum')
+        length = lbann.Max(length, self.constant(1, [1]))
+        x = lbann.Add(
+            lbann.Multiply(keep_mask, x),
+            lbann.Multiply(ignore_mask, self.constant(-1, hint_layer=x)),
+        )
+        x = lbann.Slice(x, slice_points=str_list(range(self.input_feature_dims)))
+        x = [lbann.Identity(x) for _ in range(self.input_feature_dims-1)]
+        x = [lbann.OneHot(xi, size=self.dictionary_size) for xi in x]
+        x = [lbann.Reshape(xi, dims=str_list([1, self.dictionary_size])) for xi in x]
+        x = lbann.Concatenation(x, axis=0)
+
+        # recon_loss = F.cross_entropy(
+        #     y[:, :-1].contiguous().view(-1, y.size(-1)),
+        #     x[:, 1:].contiguous().view(-1),
+        #     ignore_index=self.pad
+        # )
+        # Note: Ideally we'd shift y by y.max(-1) for numerical stability
+        shifts = lbann.MatMul(
+            lbann.Max(y, self.constant(0, hint_layer=y)),
+            self.constant(
+                1 / math.sqrt(self.dictionary_size),
+                [self.dictionary_size, self.dictionary_size],
+            ),
+        )
+        y = lbann.Subtract(y, shifts)
+        z = lbann.MatMul(
+            lbann.Exp(y),
+            self.constant(1, [self.dictionary_size, 1]),
+        )
+        z = lbann.Log(z)
+        z = lbann.MatMul(
+            lbann.Reshape(keep_mask, dims=str_list([1, -1])),
+            z,
+        )
+        recon_loss = lbann.MatMul(
+            lbann.Reshape(y, dims=str_list([1, -1])),
+            lbann.Reshape(x, dims=str_list([1, -1])),
+            transpose_b=True,
+        )
+        recon_loss = lbann.Subtract(z, recon_loss)
+        recon_loss = lbann.Reshape(recon_loss, dims=str_list([1]))
+        recon_loss = lbann.Divide(recon_loss, length)
+
+        return recon_loss
+
+    def constant(self, value, dims=[], datatype=None, hint_layer=None):
+        return lbann.Constant(
+            value=value,
+            num_neurons=str_list(dims),
+            datatype=datatype,
+            hint_layer=hint_layer,
+        )
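`compute_loss` above assembles a masked cross entropy out of one-hot encodings and `MatMul` layers: a log-partition term minus the logit of the target index, with padded positions masked out and the sum divided by the kept length. A back-of-the-envelope numpy sketch of the same arithmetic (hypothetical shapes, not the LBANN API) may help when reading that graph:

```python
import numpy as np

# Masked cross entropy, as built layer-by-layer in compute_loss:
# loss = sum_i keep_i * (log(sum_j exp(y_ij)) - y[i, x_i]) / num_kept
y = np.random.randn(5, 7)        # (sequence positions, vocab size) logits
x = np.array([3, 1, 0, 2, 0])    # target indices; say 0 is the padding index
keep = (x != 0).astype(float)    # mask out ignored positions

log_z = np.log(np.exp(y).sum(axis=1))   # per-position log partition
picked = y[np.arange(len(x)), x]        # logit of each target index
loss = (keep * (log_z - picked)).sum() / max(keep.sum(), 1)
```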
diff --git a/applications/ATOM/train_atom_vae.py b/applications/ATOM/train_atom_vae.py
new file mode 100644
index 00000000000..15e36270de7
--- /dev/null
+++ b/applications/ATOM/train_atom_vae.py
@@ -0,0 +1,276 @@
+import argparse
+import datetime
+import os
+import os.path
+import sys
+
+from google.protobuf import text_format as txtf
+import json
+import numpy as np
+import models.vae as molvae
+
+import lbann
+import lbann.contrib.launcher
+import lbann.modules
+from lbann.util import str_list
+
+def list2str(l):
+    return ' '.join(l)
+
+def construct_lc_launcher_args():
+
+    # Defaults correspond to the settings needed for training on the MOSES dataset
+    parser = argparse.ArgumentParser(prog="lbann ATOM VAE training")
+    parser.add_argument("--partition", default=None)
+    parser.add_argument("--account", default="hpcdl")
+    parser.add_argument("--scheduler", type=str, default="slurm")
+    parser.add_argument(
+        "--data-module-file",
+        default="dataset.py",
+        help="specifies the module that contains the logic for loading data",
+    )
+    parser.add_argument(
+        "--data-config",
+        default=os.path.join(
+            os.path.abspath(os.path.dirname(__file__)), "zinc_data_config.json"
+        ),
+        help="path to a data config file that is used for the construction of python data reader",
+    )
+    parser.add_argument(
+        "--time-limit",
+        type=int,
+        default=720,
+        help="specifies the time limit in number of minutes",
+    )
+    parser.add_argument("--nodes", type=int, default=1)
+    parser.add_argument("--job-name", default="atom_vae")
+    parser.add_argument("--embedding-dim", type=int, default=None)
+    parser.add_argument("--num-embeddings", type=int, default=None)
+    parser.add_argument("--batch-size", type=int, default=512)
+    parser.add_argument("--num-epochs", type=int, default=20)
+    parser.add_argument("--data-reader-prototext", default=None)
+    parser.add_argument("--data-filedir", default=None)
+    parser.add_argument("--data-filename", default=None)
+    parser.add_argument("--pad-index", type=int, default=None)
+    parser.add_argument("--sequence-length", type=int, default=None)
+    parser.add_argument("--dump-weights-dir", type=str, default="weights")
+    parser.add_argument("--dump-weights-interval", type=int, default=10)
+    parser.add_argument("--num-samples", type=int, default=None)
+    parser.add_argument("--num-io-threads", type=int, default=11)
+    parser.add_argument("--vocab", default=None)
+    parser.add_argument("--delimiter", default="c")
+    parser.add_argument("--no-header", type=bool, default=True)
+    parser.add_argument("--ltfb", type=bool, default=False)
+    parser.add_argument("--ltfb-batch-interval", type=int, default=100)
+    parser.add_argument("--weights-to-send", type=str, default='')
+
+    # These are specific to the Trainer object
+    parser.add_argument(
+        "--procs-per-trainer",
+        type=int,
+        default=0,
+        help="number of processes to use per trainer",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=3e-4,
+        help="optimizer learning rate to use for training",
+    )
+    return parser.parse_args()
+
+# ==============================================
+# Setup and launch experiment
+# ==============================================
+
+def construct_model(run_args):
+    """Construct LBANN model.
+
+    Initial model for ATOM molecular VAE
+
+    """
+    import lbann
+
+    pad_index = run_args.pad_index
+    assert pad_index is not None
+
+    sequence_length = run_args.sequence_length
+    assert sequence_length is not None
+
+    print("sequence length is {}".format(sequence_length))
+    data_layout = "data_parallel"
+
+    # Layer graph
+    input_ = lbann.Identity(lbann.Input(name='inp',target_mode="N/A"), name='inp1')
+    vae_loss = []
+    input_feature_dims = sequence_length
+
+    embedding_size = run_args.embedding_dim
+    dictionary_size = run_args.num_embeddings
+    assert embedding_size is not None
+    assert dictionary_size is not None
+
+    kl, recon = molvae.MolVAE(input_feature_dims,
+                              dictionary_size,
+                              embedding_size,
+                              pad_index)(input_)
+
+    vae_loss.append(kl)
+    vae_loss.append(recon)
+    print("LEN vae loss ", len(vae_loss))
+
+    layers = list(lbann.traverse_layer_graph(input_))
+
+    # Setup objective function
+    weights = set()
+    for l in layers:
+        weights.update(l.weights)
+    l2_reg = lbann.L2WeightRegularization(weights=weights, scale=5e-4)
+    obj = lbann.ObjectiveFunction(vae_loss)
+
+    # Initialize check metric callback
+    metrics = [lbann.Metric(kl, name='kl_loss'),
+               lbann.Metric(recon, name='recon')
+               ]
+
+    callbacks = [lbann.CallbackPrint(),
+                 lbann.CallbackTimer()]
+
+    if(run_args.dump_weights_interval > 0):
+        callbacks.append(lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir,
+                                                   epoch_interval=run_args.dump_weights_interval))
+    if(run_args.ltfb):
+        send_name = ('' if run_args.weights_to_send == 'All' else run_args.weights_to_send)  # hack for Merlin empty string
+        weights_to_ex = [w.name for w in weights if send_name in w.name]
+        print("LTFB Weights to exchange ", weights_to_ex)
+        callbacks.append(lbann.CallbackLTFB(batch_interval=run_args.ltfb_batch_interval, metric='recon',
+                                            weights=list2str(weights_to_ex),
+                                            low_score_wins=True, exchange_hyperparameters=True))
+
+    # Construct model
+    return lbann.Model(run_args.num_epochs,
+                       weights=weights,
+                       layers=layers,
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+
+def construct_data_reader(run_args):
+    """
+    Construct Protobuf message for Python data reader.
+
+    The Python data reader will import this Python file to access the
+    sample access functions.
+
+    """
+
+    module_file = os.path.abspath(run_args.data_module_file)
+    os.environ["DATA_CONFIG"] = os.path.abspath(run_args.data_config)
+
+    module_name = os.path.splitext(os.path.basename(module_file))[0]
+    module_dir = os.path.dirname(module_file)
+
+    print("module_name: {}\tmodule_dir: {}".format(module_name, module_dir))
+
+    # Base data reader message
+    message = lbann.reader_pb2.DataReader()
+
+    # Training set data reader
+    data_reader = message.reader.add()
+    data_reader.name = "python"
+    data_reader.role = "train"
+    data_reader.shuffle = True
+    data_reader.percent_of_data_to_use = 1.0
+    data_reader.validation_percent = 0.1
+    data_reader.python.module = module_name
+    data_reader.python.module_dir = module_dir
+    data_reader.python.sample_function = "get_sample"
+    data_reader.python.num_samples_function = "num_samples"
+    data_reader.python.sample_dims_function = "sample_dims"
+
+    return message
+
+
+def main():
+    run_args = construct_lc_launcher_args()
+
+    # Add data_config data,
+    # and do not overwrite args if data_reader_prototext is enabled
+    if os.path.isfile(run_args.data_config) and not run_args.data_reader_prototext:
+        with open(run_args.data_config, "r") as f:
+            config = json.load(f)
+        for k, v in config.items():
+            setattr(run_args, k, v)
+
+    trainer = lbann.Trainer(
+        run_args.batch_size,
+        name=None,
+        procs_per_trainer=run_args.procs_per_trainer,
+    )
+
+    # Define data_reader
+    if run_args.data_reader_prototext:
+        print("Using data_reader_prototext")
+        assert run_args.sequence_length is not None
+        assert run_args.vocab is not None
+
+        data_reader_proto = lbann.lbann_pb2.LbannPB()
+        with open(run_args.data_reader_prototext, "r") as f:
+            txtf.Merge(f.read(), data_reader_proto)
+        data_reader = data_reader_proto.data_reader
+    else:
+        data_reader = construct_data_reader(run_args)
+
+    if "LBANN_EXPERIMENT_DIR" in os.environ:
+        work_dir = os.environ["LBANN_EXPERIMENT_DIR"]
+    else:
+        work_dir = os.path.join(os.getcwd())
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    experiment_dir = os.path.join(
+        work_dir, "{}_{}".format(timestamp, run_args.job_name)
+    )
+    if not os.path.exists(experiment_dir):
+        os.makedirs(experiment_dir)
+
+    # Model and optimizer
+    model = construct_model(run_args)
+    opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8)
+
+    # Dump the config to the experiment_dir so that it can be used to load the model in pytorch (moses codebase)
+    ppn = 4 if run_args.scheduler == "lsf" else 2
+    print("args:\n" + str(run_args))
+    if(run_args.scheduler == 'slurm'):
+        import torch
+        torch.save(run_args, "{}/{}_config.pt".format(experiment_dir, run_args.job_name))
+
+    m_lbann_args = f"--vocab={run_args.vocab} --data_filedir={run_args.data_filedir} --data_filename_train={run_args.data_filename} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length} --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}"
+    if(run_args.data_reader_prototext):
+        m_lbann_args = " ".join((m_lbann_args, " --use_data_store --preload_data_store "))
+    if(run_args.ltfb):
+        m_lbann_args = " ".join((m_lbann_args, "--ltfb"))
+
+    status = lbann.contrib.launcher.run(
+        trainer,
+        model,
+        data_reader,
+        opt,
+        partition=run_args.partition,
+        scheduler=run_args.scheduler,
+        #account=run_args.account,
+        time_limit=run_args.time_limit,
+        nodes=run_args.nodes,
+        procs_per_node=ppn,
+        #batch_job = True,
+        #setup_only = True,
+        job_name=run_args.job_name,
+        experiment_dir=experiment_dir,
+        lbann_args=m_lbann_args,
+        environment={
+            'LBANN_USE_CUBLAS_TENSOR_OPS' : 1,
+            'LBANN_USE_CUDNN_TENSOR_OPS' : 1,
+        },
+    )
+
+    print("LBANN launcher status:\n" + str(status))
+
+
+if __name__ == "__main__":
+    sys.exit(main())
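For reference, a plausible invocation of this driver, using only flags defined in the argparse block above (the paths are placeholders, and the data config shown is the one added below in this diff):

```
python3 train_atom_vae.py \
    --scheduler slurm --nodes 1 \
    --batch-size 512 --num-epochs 20 \
    --data-config zinc10k_data_config.json \
    --dump-weights-dir weights --dump-weights-interval 10
```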
diff --git a/applications/ATOM/zinc10k_data_config.json b/applications/ATOM/zinc10k_data_config.json
new file mode 100644
index 00000000000..a5432b1ff14
--- /dev/null
+++ b/applications/ATOM/zinc10k_data_config.json
@@ -0,0 +1,10 @@
+{
+
+    "pad_index": 27,
+    "sequence_length": 56,
+    "max_seq_len": 56,
+    "data_path" : "/p/gscratchr/brainusr/datasets/zinc/moses_zinc_train10K.npy",
+    "embedding_dim": 29,
+    "num_embeddings": 29
+
+}
diff --git a/applications/MOF/MOFae.py b/applications/MOF/MOFae.py
index bfef507aaad..90d235ff11b 100644
--- a/applications/MOF/MOFae.py
+++ b/applications/MOF/MOFae.py
@@ -7,10 +7,11 @@
 # ----------------------------------
 def gen_layers(latent_dim, number_of_atoms):
     ''' Generates the model for the 3D Convolutional Auto Encoder.
-
-    returns the Directed Acyclic Graph (DAG) that the lbann
+
+    returns the Directed Acyclic Graph (DAG) that the lbann
     model will run on.
     '''
+
     input_ = lbann.Input( target_mode = "reconstruction")
     tensors = lbann.Identity(input_)
diff --git a/applications/MOF/README.md b/applications/MOF/README.md
index d5444889519..add112bc6b1 100644
--- a/applications/MOF/README.md
+++ b/applications/MOF/README.md
@@ -35,7 +35,6 @@ python3 -m pytest
 
 For more information on the data representation:
 
-
 @article {Kimeaax9324,
 	author = {Kim, Baekjun and Lee, Sangwon and Kim, Jihan},
 	title = {Inverse design of porous materials using artificial neural networks},
@@ -49,4 +48,6 @@ For more information on the data representation:
 	eprint = {https://advances.sciencemag.org/content/6/1/eaax9324.full.pdf},
 	journal = {Science Advances}
 }
-
+
+The model is based on work supported by the National Science Foundation under Grant No. DMR-1940243.
+Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation.
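The JSON config above is consumed by `main()` in `train_atom_vae.py`, which copies each key onto the parsed argument namespace (the `setattr` loop in that file). A minimal sketch of that merge, with a hypothetical stand-in for the argparse namespace:

```python
import json
import types

# Hypothetical stand-in for the argparse namespace used in train_atom_vae.py
run_args = types.SimpleNamespace(data_reader_prototext=None)

with open("zinc10k_data_config.json") as f:
    for k, v in json.load(f).items():
        setattr(run_args, k, v)   # e.g. run_args.pad_index == 27
```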
diff --git a/applications/graph/GNN/Dense_Graph_Trainer.py b/applications/graph/GNN/Dense_Graph_Trainer.py
new file mode 100644
index 00000000000..42a2e0fb473
--- /dev/null
+++ b/applications/graph/GNN/Dense_Graph_Trainer.py
@@ -0,0 +1,205 @@
+import lbann
+from lbann.util import str_list
+from lbann.modules.graph import DenseGCNConv, DenseGraphConv
+
+
+def DGCN_layer(feature_matrix, adj_matrix, node_features):
+    """An example 3-layer GCN kernel.
+    Args:
+        feature_matrix (Layer): Node feature layer. Should have the shape:
+                                (num_nodes, node_features)
+        adj_matrix (Layer): Adjacency matrix layer. Should have the shape:
+                            (num_nodes, num_nodes)
+        node_features (int): The number of features per node
+    Returns:
+        (Layer): Returns the new embedding of the node features
+    """
+    out_channel_1 = 1024
+    out_channel_2 = 512
+    out_channel_3 = 256
+
+    gcn1 = DenseGCNConv(input_channels = node_features, output_channels = out_channel_1)
+    gcn2 = DenseGCNConv(input_channels = out_channel_1, output_channels = out_channel_2)
+    gcn3 = DenseGCNConv(input_channels = out_channel_2, output_channels = out_channel_3)
+
+    out_channel = out_channel_3
+
+    x = gcn1(feature_matrix, adj_matrix)
+    x = lbann.Relu(x, name="DGCN1_activation")
+
+    x = gcn2(x, adj_matrix)
+    x = lbann.Relu(x, name="DGCN2_activation")
+
+    x = gcn3(x, adj_matrix)
+    x = lbann.Relu(x, name="DGCN3_activation")
+    return x
+
+
+def DGraph_Layer(feature_matrix, adj_matrix, node_features):
+    """An example 3-layer Graph kernel.
+    Args:
+        feature_matrix (Layer): Node feature layer. Should have the shape:
+                                (num_nodes, node_features)
+        adj_matrix (Layer): Adjacency matrix layer. Should have the shape:
+                            (num_nodes, num_nodes)
+        node_features (int): The number of features per node
+    Returns:
+        (Layer): Returns the new embedding of the node features
+    """
+    out_channel_1 = 1024
+    out_channel_2 = 512
+    out_channel_3 = 256
+
+    gcn1 = DenseGraphConv(input_channels = node_features, output_channels = out_channel_1)
+    gcn2 = DenseGraphConv(input_channels = out_channel_1, output_channels = out_channel_2)
+    gcn3 = DenseGraphConv(input_channels = out_channel_2, output_channels = out_channel_3)
+
+    out_channel = out_channel_3
+
+    x = gcn1(feature_matrix, adj_matrix)
+    x = lbann.Relu(x, name="DGraph1_activation")
+
+    x = gcn2(x, adj_matrix)
+    x = lbann.Relu(x, name="DGraph2_activation")
+
+    x = gcn3(x, adj_matrix)
+    x = lbann.Relu(x, name="DGraph3_activation")
+    return x
+
+
+def make_model(num_vertices = None,
+               node_features = None,
+               num_classes = None,
+               dataset = None,
+               kernel_type = 'GCN',
+               callbacks = None,
+               num_epochs = 1):
+    '''Construct a model DAG using one of the Graph Kernels
+
+    Args:
+        num_vertices (int): Number of vertices of each graph (default: None)
+        node_features (int): Number of features per node (default: None)
+        num_classes (int): Number of classes as targets (default: None)
+        dataset (str): Preset data set to use. Either a dataset parameter has to be
+                       supplied or all of num_vertices, node_features, and
+                       num_classes have to be supplied. (default: None)
+        kernel_type (str): Graph Kernel to use in model. Expected one of
+                           GCN, or Graph (default: GCN)
+        callbacks (list): Callbacks for the model. If set to None the model description,
+                          GPU usage, training output, and timer are reported.
+                          (default: None)
+        num_epochs (int): Number of epochs to run (default: 1)
+    Returns:
+        (lbann.Model): A model object with the supplied callbacks, dataset
+                       presets, and graph kernels.
+    '''
+
+    assert num_vertices != dataset  # Ensure at least one of the values is set
+
+    if dataset is not None:
+        assert num_vertices is None
+
+        if dataset == 'MNIST':
+            num_vertices = 75
+            num_classes = 10
+            node_features = 1
+        elif dataset == 'PROTEINS':
+            num_vertices = 100
+            num_classes = 2
+            node_features = 3
+        else:
+            raise Exception("Unknown Dataset")
+
+    assert num_vertices is not None
+    assert num_classes is not None
+    assert node_features is not None
+
+
+    #----------------------------------
+    # Reshape and Slice Input Tensor
+    #----------------------------------
+
+    input_ = lbann.Input(target_mode = 'classification')
+
+    # Input dimensions should be (num_vertices * node_features + num_vertices^2 + num_classes)
+    # Input should have at least two children since the target is classification
+
+    sample_dims = num_vertices*node_features + (num_vertices ** 2) + num_classes
+    graph_dims = num_vertices*node_features + (num_vertices ** 2)
+    feature_matrix_size = num_vertices * node_features
+
+    graph_input = lbann.Slice(input_, axis = 0,
+                              slice_points = str_list([0, feature_matrix_size, graph_dims, sample_dims]),
+                              name = "Graph_Input")
+
+
+    feature_matrix = lbann.Reshape(graph_input,
+                                   dims = str_list([num_vertices, node_features]),
+                                   name="Node_features")
+
+    adj_matrix = lbann.Reshape(graph_input,
+                               dims = str_list([num_vertices, num_vertices]),
+                               name="Adj_Mat")
+
+    target = lbann.Identity(graph_input, name="Target")
+    target = lbann.Reshape(target, dims=str(num_classes))
+
+    #----------------------------------
+    # Perform Graph Convolution
+    #----------------------------------
+
+    if kernel_type == 'GCN':
+        x = DGCN_layer(feature_matrix, adj_matrix, node_features)
+    elif kernel_type == 'Graph':
+        x = DGraph_Layer(feature_matrix, adj_matrix, node_features)
+    else:
+        raise ValueError('Invalid Graph kernel specifier "{}" received. Expected one of:\
+                          GCN or Graph'.format(kernel_type))
+    out_channel = 256
+
+    #----------------------------------
+    # Apply Reduction on Node Features
+    #----------------------------------
+
+    average_vector = lbann.Constant(value = 1/num_vertices, num_neurons = str_list([1, num_vertices]), name="Average_Vector")
+    x = lbann.MatMul(average_vector, x, name="Node_Feature_Reduction")  # x is now a vector with out_channel dimensions
+
+    x = lbann.Reshape(x, dims = str_list([out_channel]), name="Squeeze")
+    x = lbann.FullyConnected(x, num_neurons=256, name="hidden_layer_1")
+    x = lbann.Relu(x, name="hidden_layer_1_activation")
+    x = lbann.FullyConnected(x, num_neurons=num_classes, name="Output_Fully_Connected")
+
+    #----------------------------------
+    # Loss Function and Accuracy
+    #----------------------------------
+
+
+    probs = lbann.Softmax(x, name="Softmax")
+    loss = lbann.CrossEntropy(probs, target, name="Cross_Entropy_Loss")
+    accuracy = lbann.CategoricalAccuracy(probs, target, name="Accuracy")
+
+    layers = lbann.traverse_layer_graph(input_)
+    if callbacks is None:
+        print_model = lbann.CallbackPrintModelDescription()  # Prints initial Model after Setup
+        training_output = lbann.CallbackPrint(interval = 1,
+                                              print_global_stat_only = False)  # Prints training progress
+        gpu_usage = lbann.CallbackGPUMemoryUsage()
+        timer = lbann.CallbackTimer()
+        callbacks = [print_model, training_output, gpu_usage, timer]
+    else:
+        if isinstance(callbacks, list):
+            callbacks = callbacks
+    metrics = [lbann.Metric(accuracy, name='accuracy', unit="%")]
+
+    model = lbann.Model(num_epochs,
+                        layers = layers,
+                        objective_function = loss,
+                        metrics = metrics,
+                        callbacks = callbacks
+                        )
+    return model
+
+
+if __name__ == '__main__':
+    model = make_model(dataset="MNIST")
+    model = make_model(dataset="MNIST", kernel_type = 'Graph')
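The reduction step above averages node embeddings by multiplying with a constant 1/num_vertices row vector. A small numpy sketch (hypothetical sizes) of why that single MatMul is mean pooling over nodes:

```python
import numpy as np

num_vertices, out_channel = 4, 3
x = np.random.randn(num_vertices, out_channel)        # node embeddings
avg = np.full((1, num_vertices), 1.0 / num_vertices)  # the "Average_Vector" constant

pooled = avg @ x   # shape (1, out_channel)
assert np.allclose(pooled, x.mean(axis=0, keepdims=True))
```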
diff --git a/applications/graph/GNN/README.md b/applications/graph/GNN/README.md
new file mode 100644
index 00000000000..1705f0a5e76
--- /dev/null
+++ b/applications/graph/GNN/README.md
@@ -0,0 +1,48 @@
+## LBANN's Implementation of Graph Convolutional Kernels
+This directory contains models which use graph convolution kernels. The graph sub-module in lbann.modules enables
+geometric deep learning on LBANN.
+
+## Datasets
+The datasets used to test the graph layers are:
+
+1. MNIST Superpixel
+2. PROTEINS
+
+To automatically download the MNIST Superpixel dataset:
+
+```
+cd data/MNIST_Superpixel
+python3 MNIST_Superpixel_Dataset.py
+```
+
+To add self loops and normalize the adjacency matrix, run:
+
+```
+python3 update_adj_mat.py
+```
+
+To automatically download the PROTEINS dataset:
+```
+cd data/PROTEINS
+python3 PROTEINS_Dataset.py
+```
+
+Note: Both datasets require a significant amount of preprocessing after download, so
+the download and processing step should be run using the scheduler.
+
+
+## Running Instructions
+To run a model with a graph kernel and a dataset:
+
+```
+python3 main.py --dataset (PROTEINS/MNIST) --model (GCN/GIN/GRAPH/GATEDGRAPH) --mini-batch-size MB --num-epochs N
+
+```
+
+
+## Links
+
+- Li, Yujia, et al. "Gated graph sequence neural networks." arXiv preprint arXiv:1511.05493 (2015).
+- Kipf, Thomas N., and Max Welling. "Semi-supervised classification with graph convolutional networks." arXiv preprint arXiv:1609.02907 (2016).
+- Xu, Keyulu, et al. "How powerful are graph neural networks?" arXiv preprint arXiv:1810.00826 (2018).
+- Morris, Christopher, et al. "Weisfeiler and Leman go neural: Higher-order graph neural networks." Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 33. 2019.
diff --git a/applications/graph/GNN/Sparse_Graph_Trainer.py b/applications/graph/GNN/Sparse_Graph_Trainer.py
new file mode 100644
index 00000000000..2107f1ebf2d
--- /dev/null
+++ b/applications/graph/GNN/Sparse_Graph_Trainer.py
@@ -0,0 +1,245 @@
+import lbann
+from lbann.util import str_list
+from lbann.modules.graph import GINConv, GCNConv, GraphConv, GatedGraphConv
+from lbann.modules.graph import GraphVertexData
+from graph_data_util import lbann_Graph_Data
+
+def GINConvLayer(X, A):
+    """An example GIN kernel with a 4-layer-deep sequential nn.
+    Args:
+        X (GraphVertexData): Contains all the node features of the graph
+        A (Layer): Adjacency matrix layer. Should have the shape:
+                   (num_nodes, num_nodes)
+    Returns:
+        (GraphVertexData): Returns the new embedding of the node features
+    """
+    FC = lbann.modules.FullyConnectedModule
+    sequential_nn = \
+        [FC(128),
+         lbann.Relu,
+         FC(64),
+         lbann.Relu,
+         FC(32),
+         lbann.Relu,
+         FC(16),
+         lbann.Relu]
+    out_channel = 16
+
+    gin = GINConv(sequential_nn, output_channels = out_channel)
+    return gin(X, A)
+
+
+def GCNConvLayer(X, A):
+    """An example 2-layer GCN kernel.
+    Args:
+        X (GraphVertexData): Contains all the node features of the graph
+        A (Layer): Adjacency matrix layer. Should have the shape:
+                   (num_nodes, num_nodes)
+    Returns:
+        (GraphVertexData): Returns the new embedding of the node features
+    """
+    input_channels_1 = X.shape[1]
+    out_channels_1 = 8
+    input_channels_2 = out_channels_1
+    out_channels_2 = 16
+
+    gcn_1 = GCNConv(input_channels_1, out_channels_1,
+                    bias = True,
+                    activation = lbann.Relu,
+                    name = 'GCN_1',
+                    data_layout = 'data_parallel')
+    gcn_2 = GCNConv(input_channels_2, out_channels_2,
+                    bias = True,
+                    activation = lbann.Relu,
+                    name = 'GCN_2',
+                    data_layout = 'data_parallel')
+    X = gcn_1(X, A)
+    return gcn_2(X, A)
+
+
+def GraphConvLayer(X, A):
+    """An example 2-layer Graph kernel.
+    Args:
+        X (GraphVertexData): Contains all the node features of the graph
+        A (Layer): Adjacency matrix layer. Should have the shape:
+                   (num_nodes, num_nodes)
+    Returns:
+        (GraphVertexData): Returns the new embedding of the node features
+    """
+    input_channels_1 = X.shape[1]
+    out_channels_1 = 8
+    input_channels_2 = out_channels_1
+    out_channels_2 = 16
+
+    graph_1 = GraphConv(input_channels_1, out_channels_1,
+                        bias = True,
+                        activation = lbann.Relu,
+                        name = 'Graph_kernel_1',
+                        data_layout = 'data_parallel')
+    graph_2 = GraphConv(input_channels_2, out_channels_2,
+                        bias = True,
+                        activation = lbann.Relu,
+                        name = 'Graph_Kernel_2',
+                        data_layout = 'data_parallel')
+
+    X = graph_1(X, A)
+    return graph_2(X, A)
+
+def GATConvLayer(X, A):
+    """An example single-layer GatedGraph kernel.
+    Args:
+        X (GraphVertexData): Contains all the node features of the graph
+        A (Layer): Adjacency matrix layer. Should have the shape:
+                   (num_nodes, num_nodes)
+    Returns:
+        (GraphVertexData): Returns the new embedding of the node features
+    """
+
+    output_channels = 8
+    num_layers = 3
+    name = 'GatedGraph'
+    data_layout = 'data_parallel'
+
+    graph_kernel = GatedGraphConv(output_channels,
+                                  num_layers = num_layers,
+                                  name = name,
+                                  data_layout = data_layout)
+    return graph_kernel(X, A)
+
+def make_model(num_vertices = None,
+               node_features = None,
+               num_classes = None,
+               dataset = None,
+               kernel_type = 'GCN',
+               callbacks = None,
+               num_epochs = 1):
+    '''Construct a model DAG using one of the Graph Kernels
+
+    Args:
+        num_vertices (int): Number of vertices of each graph (default: None)
+        node_features (int): Number of features per node (default: None)
+        num_classes (int): Number of classes as targets (default: None)
+        dataset (str): Preset data set to use. Either a dataset parameter has to be
+                       supplied or all of num_vertices, node_features, and
+                       num_classes have to be supplied. (default: None)
+        kernel_type (str): Graph Kernel to use in model. Expected one of
+                           GCN, GIN, Graph, or GatedGraph (default: GCN)
+        callbacks (list): Callbacks for the model. If set to None the model description,
+                          GPU usage, training output, and timer are reported.
+                          (default: None)
+        num_epochs (int): Number of epochs to run (default: 1)
+    Returns:
+        (lbann.Model): A model object with the supplied callbacks, dataset
+                       presets, and graph kernels.
+    '''
+
+    assert num_vertices != dataset  # Ensure at least one of the values is set
+
+    if dataset is not None:
+        assert num_vertices is None
+
+        if dataset == 'MNIST':
+            num_vertices = 75
+            num_classes = 10
+            node_features = 1
+        elif dataset == 'PROTEINS':
+            num_vertices = 100
+            num_classes = 2
+            node_features = 3
+        else:
+            raise Exception("Unknown Dataset")
+
+    assert num_vertices is not None
+    assert num_classes is not None
+    assert node_features is not None
+
+    #----------------------------------
+    # Reshape and Slice Input Tensor
+    #----------------------------------
+
+    input_ = lbann.Input(target_mode = 'classification')
+
+    # Input dimensions should be (num_vertices * node_features + num_vertices^2 + num_classes)
+    # Input should have at least two children since the target is classification
+
+    data = lbann_Graph_Data(input_, num_vertices, node_features, num_classes)
+
+    feature_matrix = data.x
+    adj_matrix = data.adj
+    target = data.y
+
+    #----------------------------------
+    # Perform Graph Convolution
+    #----------------------------------
+
+    if kernel_type == 'GIN':
+        x = GINConvLayer(feature_matrix, adj_matrix)
+    elif kernel_type == 'GCN':
+        x = GCNConvLayer(feature_matrix, adj_matrix)
+    elif kernel_type == 'Graph':
+        x = GraphConvLayer(feature_matrix, adj_matrix)
+    elif kernel_type == 'GatedGraph':
+        x = GATConvLayer(feature_matrix, adj_matrix)
+    else:
+        raise ValueError('Invalid Graph kernel specifier "{}" received. Expected one of:\
+                          GIN, GCN, Graph or GatedGraph'.format(kernel_type))
+
+    out_channel = x.shape[1]
+
+    #----------------------------------
+    # Apply Reduction on Node Features
+    #----------------------------------
+
+    average_vector = lbann.Constant(value = 1/num_vertices,
+                                    num_neurons = str_list([1, num_vertices]),
+                                    name="Average_Vector")
+    x = x.get_mat(out_channel)
+
+    x = lbann.MatMul(average_vector, x, name="Node_Feature_Reduction")
+
+    # x is now a vector with out_channel dimensions
+
+    x = lbann.Reshape(x, dims = str_list([out_channel]), name = "Squeeze")
+    x = lbann.FullyConnected(x, num_neurons = 64, name = "hidden_layer_1")
+    x = lbann.Relu(x, name = "hidden_layer_1_activation")
+    x = lbann.FullyConnected(x, num_neurons = num_classes,
+                             name="Output_Fully_Connected")
+
+    #----------------------------------
+    # Loss Function and Accuracy
+    #----------------------------------
+
+
+    probs = lbann.Softmax(x, name="Softmax")
+    loss = lbann.CrossEntropy(probs, target, name="Cross_Entropy_Loss")
+    accuracy = lbann.CategoricalAccuracy(probs, target, name="Accuracy")
+
+    layers = lbann.traverse_layer_graph(input_)
+
+    if callbacks is None:
+        print_model = lbann.CallbackPrintModelDescription()  # Prints initial Model after Setup
+        training_output = lbann.CallbackPrint(interval = 1,
+                                              print_global_stat_only = False)  # Prints training progress
+        gpu_usage = lbann.CallbackGPUMemoryUsage()
+        timer = lbann.CallbackTimer()
+        callbacks = [print_model, training_output, gpu_usage, timer]
+    else:
+        if isinstance(callbacks, list):
+            callbacks = callbacks
+
+    metrics = [lbann.Metric(accuracy, name='accuracy', unit="%")]
+
+    model = lbann.Model(num_epochs,
+                        layers = layers,
+                        objective_function = loss,
+                        metrics = metrics,
+                        callbacks = callbacks
+                        )
+    return model
+
+if __name__ == '__main__':
+    # Quick check to see if model generates correctly
+    model_1 = make_model(dataset="MNIST", kernel_type = 'GIN')
+    model_1 = make_model(dataset="MNIST", kernel_type = 'GCN')
+    model_1 = make_model(dataset="MNIST", kernel_type = 'Graph')
+    model_1 = make_model(dataset="MNIST", kernel_type = 'GatedGraph')
diff --git a/applications/graph/GNN/data/MNIST_Superpixel/__init__.py b/applications/graph/GNN/data/MNIST_Superpixel/__init__.py
new file mode 100644
index 00000000000..99301eab715
--- /dev/null
+++ b/applications/graph/GNN/data/MNIST_Superpixel/__init__.py
@@ -0,0 +1,24 @@
+import urllib.request
+import tarfile
+import os
+import os.path
+
+import lbann
+
+data_dir = os.path.dirname(os.path.realpath(__file__))
+
+def make_data_reader():  # TODO: Extend this for the validation / test set as well after testing
+
+    reader = lbann.reader_pb2.DataReader()
+    _reader = reader.reader.add()
+    _reader.name = 'python'
+    _reader.role = 'train'
+    _reader.shuffle = False  # Turn off shuffle for debugging
+    _reader.percent_of_data_to_use = 1.0
+    _reader.python.module = 'MNIST_Superpixel_Dataset'
+    _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__))
+    _reader.python.sample_function = 'get_train'
+    _reader.python.num_samples_function = 'num_train_samples'
+    _reader.python.sample_dims_function = 'sample_dims'
+
+    return reader
diff --git a/applications/graph/GNN/data/MNIST_Superpixel/update_adj_mat.py b/applications/graph/GNN/data/MNIST_Superpixel/update_adj_mat.py
new file mode 100644
index 00000000000..31f31123b15
--- /dev/null
+++ b/applications/graph/GNN/data/MNIST_Superpixel/update_adj_mat.py
@@ -0,0 +1,10 @@
+import numpy as np
+
+adj_mats = np.load('adj_matrices.npy')
+
+num_data = adj_mats.shape[0]
+for adj in range(num_data):
+    print(adj, " / ", num_data)
+    deg_inv_sqrt = (adj_mats[adj].sum(axis=-1).clip(min=1)**(-0.5)).reshape(len(adj_mats[adj]), 1)
+    adj_mats[adj] = deg_inv_sqrt * adj_mats[adj] * deg_inv_sqrt.T  # transpose scales columns too: D^-1/2 A D^-1/2
+np.save('adj_matrices.npy', adj_mats)
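`update_adj_mat.py` rescales each adjacency matrix by inverse square-root degrees. A quick numpy sketch of the symmetric normalization D^-1/2 A D^-1/2 it is aiming for (assuming the column/row broadcast shown above):

```python
import numpy as np

A = np.array([[0., 1., 1.],
              [1., 0., 0.],
              [1., 0., 0.]])
d = A.sum(axis=-1).clip(min=1) ** -0.5   # inverse sqrt degrees
normed = d[:, None] * A * d[None, :]     # entry (i, j) becomes A_ij / sqrt(d_i * d_j)
assert np.allclose(normed, normed.T)     # symmetric normalization stays symmetric
```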
diff --git a/applications/graph/GNN/data/MNIST_Superpixel/utils.py b/applications/graph/GNN/data/MNIST_Superpixel/utils.py
new file mode 100644
index 00000000000..bdc0256b464
--- /dev/null
+++ b/applications/graph/GNN/data/MNIST_Superpixel/utils.py
@@ -0,0 +1,100 @@
+import torch
+import urllib.request
+import tarfile
+import os
+import os.path
+import numpy as np
+import lbann
+
+data_dir = os.path.dirname(os.path.realpath(__file__))
+
+def download_data():
+    url = "http://ls7-www.cs.uni-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz"
+    training_name = "training.pt"
+    test_name = "test.pt"
+
+    files = [training_name, test_name]
+
+    for f in files:
+        data_file = os.path.join(data_dir, f)
+
+        if not os.path.isfile(data_file):  # File not in directory
+            tar_name = os.path.join(data_dir, "mnist_superpixel.tar.gz")
+
+            if not os.path.isfile(tar_name):
+                urllib.request.urlretrieve(url, filename=tar_name)
+                extract_data()
+            else:
+                extract_data()
+
+def extract_data():
+    tar_name = os.path.join(data_dir, "mnist_superpixel.tar.gz")
+    print(tar_name)
+    with tarfile.open(tar_name) as tar:
+        tar.extractall()
+        tar.close()
+
+def edge_list_to_dense(elist):
+    adj_mat = np.zeros((75, 75), dtype=np.float)
+
+    ## elist should be of shape (2, num_edges)
+
+    num_edges = elist.size(1)
+
+    for edge in range(num_edges):
+        source, sink = elist[:, edge]
+        source = source.item()
+        sink = sink.item()
+        adj_mat[source][sink] = 1.0
+        adj_mat[sink][source] = 1.0
+
+    return adj_mat
+
+def process_training_data():  # Process Training File
+    train_file_path = os.path.join(data_dir, 'training.pt')
+    #test_file_path = os.path.join(data_dir, 'test.pt')
+
+    node_features, edge_index, edge_slices, positions, y = torch.load(train_file_path)
+
+    assert y.size(0) == node_features.size(0)
+    assert y.size(0) == positions.size(0)
+    assert y.size(0) == 60000  ##
+
+    num_data = 60000
+    num_vertices = 75
+
+    # Node features should be (60000, 75)
+
+    node_features = np.float32(node_features)
+
+    # Positions should be (60000, 75, 2)
+
+    positions = np.float32(positions)
+
+    # Convert edge_index to edge matrix representation with shape (60000, 75, 75)
+
+    adj_matrices = np.zeros((num_data, num_vertices, num_vertices), dtype=np.float)
+
+    #assert (self.num_data + 1) == edge_slices.size(0), "Expected: {}, Got {}".format(60001, edge_slices.size(0))
+
+    for slice_index in range(num_data):
+        print("{}/{} completed \r".format(slice_index+1, num_data), end='', flush=True)
+        start_index = edge_slices[slice_index]
+        end_index = edge_slices[slice_index + 1]
+
+        graph_num = slice_index
+        elist = edge_index[:, start_index: end_index]
+
+        adj_matrices[graph_num] = edge_list_to_dense(elist)
+
+
+    # Convert y to target with one-hot encoding and shape (60000, 10)
+
+    targets = np.zeros((num_data, 10), dtype=np.float)
+
+    for i, target in enumerate(y):
+        print("{}/{} completed".format(i+1, len(y)), end='')
+        targets[i][target] = 1
+
+    np.save('node_features.npy', node_features)
+    np.save('positions.npy', positions)
+    np.save('adj_matrices.npy', adj_matrices)
+    np.save('targets.npy', targets)
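`edge_list_to_dense` above densifies a COO edge list into a 75x75 symmetric matrix. For intuition, a tiny self-contained version of the same conversion (plain Python ints instead of torch tensors; hypothetical helper, not part of the diff):

```python
import numpy as np

def to_dense(edges, n=4):
    adj = np.zeros((n, n), dtype=np.float32)
    for src, dst in edges:   # undirected graph: set both directions
        adj[src, dst] = adj[dst, src] = 1.0
    return adj

print(to_dense([(0, 1), (1, 2)]))
```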
diff --git a/applications/graph/GNN/data/PROTEINS/PROTEINS_Dataset.py b/applications/graph/GNN/data/PROTEINS/PROTEINS_Dataset.py
new file mode 100644
index 00000000000..b49103efacc
--- /dev/null
+++ b/applications/graph/GNN/data/PROTEINS/PROTEINS_Dataset.py
@@ -0,0 +1,76 @@
+import numpy as np
+import os
+import os.path
+import sys
+import utils
+
+files = ['node_features.npy', 'adj_mats.npy', 'targets.npy']
+
+data_dir = os.path.dirname(os.path.realpath(__file__))
+
+class PROTEINS_Dataset:
+    def __init__(self):
+        # Check if data is downloaded and processed.
+        # Load if data exists,
+        # else download and process data
+        for npy_file in files:
+            if not os.path.isfile(os.path.join(data_dir, "PROTEINS/"+npy_file)):
+                self.process_data()
+
+        self.node_features = np.load(os.path.join(data_dir, "PROTEINS/"+files[0]))
+        self.adjs = np.load(os.path.join(data_dir, "PROTEINS/"+files[1]))
+        self.targets = np.load(os.path.join(data_dir, "PROTEINS/"+files[2]))
+
+    def generate_dataset(self):
+        global data_dir
+        print(data_dir)
+        data_dir = os.path.join(data_dir, 'PROTEINS')
+        node_features, adj_mat, targets = utils.TUDataset_Parser(data_dir, 'PROTEINS', 2)
+        np.save(os.path.join(data_dir, files[0]), node_features)
+        np.save(os.path.join(data_dir, files[1]), adj_mat)
+        np.save(os.path.join(data_dir, files[2]), targets)
+
+    def process_data(self):
+        if not os.path.isfile(os.path.join(data_dir, "PROTEINS.zip")):
+            # Needs download
+            url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/PROTEINS.zip'
+            save_path = os.path.join(data_dir, 'PROTEINS.zip')
+            utils.download_url(url, save_path)
+        utils.unzip_file(os.path.join(data_dir, "PROTEINS.zip"))
+
+        self.generate_dataset()
+
+    def __len__(self):
+        return len(self.node_features)
+
+    def __getitem__(self, index):
+        x = np.float32(self.node_features[index].flatten())
+        y = np.float32(self.targets[index].flatten())
+        adj = np.float32(self.adjs[index].flatten())
+
+        return np.concatenate((x, adj, y), axis=0)
+
+training_data = PROTEINS_Dataset()
+
+def get_train(index):
+    return training_data[index]
+
+def num_train_samples():
+    return len(training_data)
+
+def sample_dims():
+    adjacency_matrix_size = 100 * 100
+    node_feature_size = 100 * 3
+    target_size = 2
+    return (adjacency_matrix_size + node_feature_size + target_size, )
+
+if __name__ == '__main__':
+    print(len(training_data))
+    print(training_data.node_features[0].shape)
+    print(training_data.adjs[0].shape)
+    print(training_data.targets[0].shape)
+    print(type(training_data[0][0]))
+    print(sys.getsizeof(training_data[0][0]))
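Each PROTEINS sample is the flattened concatenation (node features, adjacency matrix, target), so downstream models recover the pieces by slicing — exactly what the `Slice` layer in `Dense_Graph_Trainer.py` above does at points [0, 300, 10300, 10302]. A hypothetical numpy illustration of that layout:

```python
import numpy as np

sample = np.random.rand(100 * 3 + 100 * 100 + 2).astype(np.float32)

x   = sample[0:300].reshape(100, 3)         # node features
adj = sample[300:10300].reshape(100, 100)   # adjacency matrix
y   = sample[10300:10302]                   # one-hot target
```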
diff --git a/applications/graph/GNN/data/PROTEINS/__init__.py b/applications/graph/GNN/data/PROTEINS/__init__.py
new file mode 100644
index 00000000000..6204311b693
--- /dev/null
+++ b/applications/graph/GNN/data/PROTEINS/__init__.py
@@ -0,0 +1,24 @@
+import urllib.request
+import tarfile
+import os
+import os.path
+
+import lbann
+
+data_dir = os.path.dirname(os.path.realpath(__file__))
+
+def make_data_reader():  # TODO: Extend this for the validation / test set as well after testing
+
+    reader = lbann.reader_pb2.DataReader()
+    _reader = reader.reader.add()
+    _reader.name = 'python'
+    _reader.role = 'train'
+    _reader.shuffle = True  # Shuffle training samples
+    _reader.percent_of_data_to_use = 1.0
+    _reader.python.module = 'PROTEINS_Dataset'
+    _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__))
+    _reader.python.sample_function = 'get_train'
+    _reader.python.num_samples_function = 'num_train_samples'
+    _reader.python.sample_dims_function = 'sample_dims'
+
+    return reader
diff --git a/applications/graph/GNN/data/PROTEINS/utils.py b/applications/graph/GNN/data/PROTEINS/utils.py
new file mode 100644
index 00000000000..83f18fdc673
--- /dev/null
+++ b/applications/graph/GNN/data/PROTEINS/utils.py
@@ -0,0 +1,126 @@
+import urllib.request
+import tarfile
+import zipfile
+import os.path
+import numpy as np
+
+def download_url(url, save_path):
+    with urllib.request.urlopen(url) as dl_file:
+        with open(save_path, 'wb') as out_file:
+            out_file.write(dl_file.read())
+
+def untar_file(data_dir, file_name):
+    tar_name = os.path.join(data_dir, file_name)
+    with tarfile.open(tar_name) as tar:
+        tar.extractall()
+        tar.close()
+
+def unzip_file(file_name, data_dir=None):
+    if (data_dir is None):
+        data_dir = os.path.dirname(file_name)
+
+    with zipfile.ZipFile(file_name, 'r') as zip_ref:
+        zip_ref.extractall(data_dir)
+
+def edge_list_to_dense(elist, num_vertices = 75):
+    adj_mat = np.zeros((num_vertices, num_vertices), dtype=np.float)
+    num_edges = elist.shape[0]
+    for edge in range(num_edges):
+        source, sink = elist[edge, :]
+        source = source.item()
+        sink = sink.item()
+        adj_mat[source][sink] = 1.0
+        adj_mat[sink][source] = 1.0
+    return adj_mat
+
+
+########################################################
+#
+# TU Dataset specific functions
+#
+########################################################
+
+def extract_node_features(node_slices, node_labels, max_nodes, num_classes = None):
+    node_label_list = []
+    for i, ind in enumerate(node_slices[1:]):
+        if num_classes:
+            graph_x = np.eye(num_classes)[np.asarray([int(x) for x in node_labels[node_slices[i]:ind]], dtype=np.int)]
+        else:
+            graph_x = np.asarray([int(x) for x in node_labels[node_slices[i]:ind]], dtype=np.int)
+        if (len(graph_x) < max_nodes):
+            pad = max_nodes - len(graph_x)
+            graph_x = np.pad(graph_x, ((0, pad), (0, 0)), 'constant')
+        node_label_list.append(graph_x)
+    return node_label_list
+
+
+def extract_adj_mat(node_slices, edge_list, max_nodes):
+    adj_mat_list = []
+    removed_graphs = []
+    for i, max_node_id in enumerate(node_slices[1:]):
+        min_node_id = node_slices[i]
+        num_nodes = max_node_id - min_node_id
+        if (num_nodes < max_nodes):
+            edges = edge_list[(edge_list[:, 1] > min_node_id) & (edge_list[:, 1] < max_node_id)]
+            edges = edges - 1 - min_node_id
+            adj_mat = edge_list_to_dense(edges, max_nodes)
+            adj_mat_list.append(adj_mat)
+        else:
+            removed_graphs.append(i)
+
+    return adj_mat_list, removed_graphs
+
+def extract_targets(graph_labels, num_classes, removed_graphs):
+    graph_labels = np.array([int(x) for x in graph_labels])
+    labels = np.eye(num_classes)[graph_labels-1]
+    graph_labels = np.delete(labels, removed_graphs, axis=0)
+    return graph_labels
+
+def dataset_node_slices(graph_indicator_list, num_graphs):
+    node_slices = []
+
+    prev = 0
+    for i in range(num_graphs+1):
+        node_slices.append(prev + graph_indicator_list.count(str(i)))
+        prev = prev + graph_indicator_list.count(str(i))
+    return node_slices
+
+def TUDataset_Parser(data_dir, dataset_name, num_classes):
+
+    adj_file = open(os.path.join(data_dir, dataset_name + '_A.txt'), 'r')
+    graph_labels_file = open(os.path.join(data_dir, dataset_name + '_graph_labels.txt'), 'r')
+    graph_ind_file = open(os.path.join(data_dir, dataset_name + '_graph_indicator.txt'), 'r')
+    node_attr_file = open(os.path.join(data_dir, dataset_name + '_node_attributes.txt'), 'r')
+    node_labels_file = open(os.path.join(data_dir, dataset_name + '_node_labels.txt'), 'r')
+
+    graph_labels = graph_labels_file.read().rstrip().split('\n')
+    graph_ind = graph_ind_file.read().rstrip().split('\n')
+    node_attr = node_attr_file.read().rstrip().split('\n')
+    adj_list = adj_file.read().rstrip().split('\n')
+    node_labels = node_labels_file.read().rstrip().split('\n')
+
+    NUM_GRAPHS = len(graph_labels)
+    NUM_NODES = len(node_attr)
+    NUM_EDGES = len(adj_list)
+
+    adj_file.close()
+    graph_labels_file.close()
+    graph_ind_file.close()
+    node_attr_file.close()
+    node_labels_file.close()
+
+    edge_list = []
+    for edge in adj_list:
+        edge = np.array([int(x) for x in edge.split(',')])
+        edge_list.append(edge)
+
+    edge_list = np.array(edge_list)
+
+    node_slices = dataset_node_slices(graph_ind, NUM_GRAPHS)
+
+    max_nodes = 100
+    adj_mat, removed_graphs = extract_adj_mat(node_slices, edge_list, max_nodes)
+    num_features = 3
+    node_features = extract_node_features(node_slices, node_labels, max_nodes, num_features)
+    node_features = np.array(node_features)
+    targets = extract_targets(graph_labels, num_classes, removed_graphs)
+
+    return node_features, adj_mat, targets
diff --git a/applications/graph/GNN/data/__init__.py b/applications/graph/GNN/data/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/applications/graph/GNN/main.py b/applications/graph/GNN/main.py
new file mode 100644
index 00000000000..cebf7c679a7
--- /dev/null
+++ b/applications/graph/GNN/main.py
@@ -0,0 +1,91 @@
+import lbann
+import lbann.contrib.launcher
+import lbann.contrib.args
+
+import argparse
+import os
+
+import Sparse_Graph_Trainer
+import Dense_Graph_Trainer
+import data.MNIST_Superpixel
+import data.PROTEINS
+
+desc = ("Training a Graph Convolutional Model using LBANN")
+
+parser = argparse.ArgumentParser(description=desc)
+
+lbann.contrib.args.add_scheduler_arguments(parser)
+lbann.contrib.args.add_optimizer_arguments(parser)
+
+parser.add_argument(
+    '--num-epochs', action='store', default=100, type=int,
+    help='number of epochs (default: 100)', metavar='NUM')
+
+parser.add_argument(
+    '--mini-batch-size', action='store', default=32, type=int,
+    help="mini-batch size (default: 32)", metavar='NUM')
+
+parser.add_argument(
+    '--dataset', action='store', default='MNIST', type=str,
+    help="Dataset for model (default: MNIST)", metavar='NAME')
+
+parser.add_argument(
+    '--job-name', action='store', default="GCN_TEST", type=str,
+    help="Job name for scheduler", metavar='NAME')
+
+parser.add_argument(
+    '--model', action='store', default='GCN', type=str,
+    help="The type of model to use", metavar='NAME')
+
+args = parser.parse_args()
+
+
+kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
+
+dataset = args.dataset
+num_epochs = args.num_epochs
+mini_batch_size = args.mini_batch_size
+job_name = args.job_name
+model_arch = args.model
+
+
+## Get Model
+
+if (model_arch == 'GRAPH'):
+    model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS',
+                                            kernel_type = 'Graph',
+                                            num_epochs = num_epochs)
+elif(model_arch == 'GIN'):
+    model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS',
+                                            kernel_type = 'GIN',
+                                            num_epochs = num_epochs)
+elif(model_arch == 'GATEDGRAPH'):
+    model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS',
+                                            kernel_type = 'GatedGraph',
+                                            num_epochs = num_epochs)
+elif (model_arch == 'DGCN'):
+    model = Dense_Graph_Trainer.make_model(dataset = 'PROTEINS',
+                                           kernel_type = 'GCN',
+                                           num_epochs = num_epochs)
+elif (model_arch == 'DGRAPH'):
+    model = Dense_Graph_Trainer.make_model(dataset = 'PROTEINS',
+                                           kernel_type = 'Graph',
+                                           num_epochs = num_epochs)
+else:
+    model = Sparse_Graph_Trainer.make_model(dataset = 'PROTEINS',
+                                            kernel_type = 'GCN',
+                                            num_epochs = num_epochs)
+
+
+optimizer = lbann.SGD(learn_rate = 1e-3)
+
+# Add logic for choosing a dataset
+
+data_reader = data.PROTEINS.make_data_reader()
+
+trainer = lbann.Trainer(mini_batch_size = mini_batch_size)
+
+
+lbann.contrib.launcher.run(trainer, model, data_reader, optimizer,
+                           job_name = job_name,
+                           **kwargs)
diff --git a/applications/graph/GNN/test/__init__.py b/applications/graph/GNN/test/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/applications/graph/GNN/test/conftest.py b/applications/graph/GNN/test/conftest.py
new file mode 100644
index 00000000000..0179df3687b
--- /dev/null
+++ b/applications/graph/GNN/test/conftest.py
@@ -0,0 +1,41 @@
+import sys
+sys.path.insert(0, '../../../../bamboo/common_python')
+import tools
+import pytest, re, subprocess
+
+
+def pytest_addoption(parser):
+    cluster = re.sub('[0-9]+', '', subprocess.check_output(
+        'hostname'.split()).decode('utf-8').strip())
+    default_dirname = subprocess.check_output(
+        'git rev-parse --show-toplevel'.split()).decode('utf-8').strip()
+    default_exes = tools.get_default_exes(default_dirname, cluster)
+
+    parser.addoption('--cluster', action='store', default=cluster,
+                     help='--cluster= to specify the cluster being run on, for the purpose of determining which commands to use. Default the current cluster')
+    parser.addoption('--dirname', action='store', default=default_dirname,
+                     help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable')
+    parser.addoption('--exes', action='store', default=default_exes,
+                     help='--exes={compiler_name: path}')
+    parser.addoption('--weekly', action='store_true', default=False,
+                     help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False')
+
+
+@pytest.fixture
+def cluster(request):
+    return request.config.getoption('--cluster')
+
+
+@pytest.fixture
+def dirname(request):
+    return request.config.getoption('--dirname')
+
+
+@pytest.fixture
+def exes(request):
+    return request.config.getoption('--exes')
+
+
+@pytest.fixture
+def weekly(request):
+    return request.config.getoption('--weekly')
diff --git a/applications/graph/GNN/test/test_integration_DGCN.py b/applications/graph/GNN/test/test_integration_DGCN.py
new file mode 100644
index 00000000000..b94489483fd
--- /dev/null
+++ b/applications/graph/GNN/test/test_integration_DGCN.py
@@ -0,0 +1,153 @@
+import functools
+import operator
+import os
+import os.path
+import re
+import sys
+import pytest
+import lbann
+
+current_file = os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+root_dir = os.path.dirname(current_dir)
+
+import data.PROTEINS
+import Dense_Graph_Trainer
+
+graph_dir = os.path.dirname(root_dir)
+applications_dir = os.path.dirname(graph_dir)
+lbann_dir = os.path.dirname(applications_dir)
+common_python_dir = os.path.join(lbann_dir, 'bamboo/common_python')  # Added lbann/bamboo/common_python
+sys.path.append(common_python_dir)
+import tools
+
+
+num_epochs = 30
+mini_batch_size = 64
+num_nodes = 2
+
+
+expected_accuracy_range = (64, 71)
+
+expected_mini_batch_times = {
+    'ray' : 0.005
+    }
+expected_gpu_usage = {
+    'ray' : 0.7
+    }
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)
+
+    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer(), lbann.CallbackGPUMemoryUsage()]
+
+    model = Dense_Graph_Trainer.make_model(dataset = 'PROTEINS',
+                                           kernel_type = 'GCN',
+                                           num_epochs = num_epochs,
+                                           callbacks = callbacks)
+    reader = data.PROTEINS.make_data_reader()
+
+    # No validation set
+
+    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
+    return trainer, model, reader, optimizer
+
+# ==============================================
+# Setup PyTest
+# ==============================================
+
+def augment_test_func(test_func):
+    """Augment test function to parse log files.
+
+    `tools.create_tests` creates functions that run an LBANN
+    experiment. This function creates augmented functions that parse
+    the log files after LBANN finishes running, e.g. to check metrics
+    or runtimes.
+
+    Note: The naive approach is to define the augmented test functions
+    in a loop. However, Python closures are late binding. In other
+    words, the function would be overwritten every time we define it.
+    We get around this overwriting problem by defining the augmented
+    function in the local scope of another function.
+
+    Args:
+        test_func (function): Test function created by
+            `tools.create_tests`.
+
+    Returns:
+        function: Test that can interact with PyTest.
+
+    """
+    test_name = test_func.__name__
+
+    # Define test function
+    def func(cluster, exes, dirname):
+        # Run LBANN experiment
+        experiment_output = test_func(cluster, exes, dirname)
+
+        # Parse LBANN log file
+        train_accuracy = None
+        gpu_usage = None
+        mini_batch_times = []
+        gpu_usages = []
+
+        with open(experiment_output['stdout_log_file']) as f:
+            for line in f:
+                match = re.search('training epoch [0-9]+ accuracy : ([0-9.]+)%', line)
+                if match:
+                    train_accuracy = float(match.group(1))
+                match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line)
+                if match:
+                    mini_batch_times.append(float(match.group(1)))
+                match = re.search('GPU memory usage statistics : ([0-9.]+) GiB mean', line)
+                if match:
+                    gpu_usages.append(float(match.group(1)))
+
+        # Check if training accuracy is within expected range
+        assert (expected_accuracy_range[0]
+                < train_accuracy
+ 0) :
 callbacks.append(lbann.CallbackLTFB(batch_interval=args.ltfb_batch_interval, metric='recon_error',
                                     low_score_wins=True,
                                     exchange_hyperparameters=True))
 # Construct model
 return lbann.Model(args.num_epochs,
                    serialize_io=True,
@@ -160,7 +171,7 @@ def construct_model():
         nodes=args.num_nodes,
         procs_per_node=args.ppn,
         time_limit=720,
-        setup_only=False,
+        setup_only=True,
         job_name=args.job_name,
         lbann_args=['--use_data_store --preload_data_store',
                     f'--metadata={metadata_prototext}',
diff --git a/applications/physics/ICF/train_macc_surrogate.py b/applications/physics/ICF/train_macc_surrogate.py
index 396465d5838..82c072c5486 100644
--- a/applications/physics/ICF/train_macc_surrogate.py
+++ b/applications/physics/ICF/train_macc_surrogate.py
@@ -52,6 +52,12 @@ parser.add_argument(
     '--xdim', action='store', default=5, type=int,
     help='input (x) dim (default: 5)', metavar='NUM')
+parser.add_argument(
+    '--wae_mcf', action='store', default=1, type=int,
+    help='model capacity factor (default: 1)', metavar='NUM')
+parser.add_argument(
+    '--surrogate_mcf', action='store', default=1, type=int,
+    help='model capacity factor (default: 1)', metavar='NUM')
 parser.add_argument(
     '--lamda-cyc',
action='store', default=1e-3, type=float, help='lamda-cyc (default: 1e-3)', metavar='NUM') @@ -82,11 +88,13 @@ parser.add_argument( '--procs-per-trainer', action='store', default=0, type=int, help='processes per trainer (default: 0)', metavar='NUM') +parser.add_argument( + '--ltfb-batch-interval', action='store', default=0, type=int, + help='LTFB batch interval (default: 0, no LTFB)', metavar='NUM') args = parser.parse_args() if not(args.pretrained_dir): - print("WARNING pretrained dir ", args.pretrained_dir, " is empty, default option assumes - pretrained autoencoder") + print("WARNING pretrained dir ", args.pretrained_dir, " is empty, default option assumes pretrained autoencoder") def list2str(l): return ' '.join(l) @@ -111,9 +119,9 @@ def construct_model(): z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") - wae = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) #pretrained, freeze - inv = macc_models.MACCInverse(args.xdim) - fwd = macc_models.MACCForward(args.zdim) + wae = macc_models.MACCWAE(args.zdim,args.ydim,cf=args.wae_mcf,use_CNN=args.useCNN) #pretrained, freeze + inv = macc_models.MACCInverse(args.xdim,cf=args.surrogate_mcf) + fwd = macc_models.MACCForward(args.zdim,cf=args.surrogate_mcf) y_pred_fwd = wae.encoder(gt_y) @@ -175,6 +183,10 @@ def construct_model(): lbann.CallbackLoadModel(dirs=str(args.pretrained_dir)), lbann.CallbackTimer()] + if(args.ltfb_batch_interval > 0) : + callbacks.append(lbann.CallbackLTFB(batch_interval=args.ltfb_batch_interval,metric='fw_loss', + low_score_wins=True, + exchange_hyperparameters=True)) # Construct model return lbann.Model(args.num_epochs, weights=weights, diff --git a/applications/physics/data/jag_100M_metadata.prototext b/applications/physics/data/jag_100M_metadata.prototext index 7e22e71f0a9..de258021039 100644 --- a/applications/physics/data/jag_100M_metadata.prototext +++ b/applications/physics/data/jag_100M_metadata.prototext @@ -24,8 +24,8 @@ data_set_metadata { image_height: 64 image_num_channels: 4 - #jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] #3 views - jag_image_keys: ["(0.0, 0.0)/0.0/emi"] #1 view, default + jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] #3 views + #jag_image_keys: ["(0.0, 0.0)/0.0/emi"] #1 view, default scalar_prefix: "/outputs/scalars/" diff --git a/bamboo/common_python/data/imagenet/data_reader.prototext b/bamboo/common_python/data/imagenet/data_reader.prototext index 3f4e0270f3f..08ddf8b8161 100644 --- a/bamboo/common_python/data/imagenet/data_reader.prototext +++ b/bamboo/common_python/data/imagenet/data_reader.prototext @@ -3,9 +3,11 @@ data_reader { name: "imagenet" role: "train" shuffle: true - data_filedir: "path/to/ILSVRC2012/train" - data_filename: "path/to/ILSVRC2012/labels/train.txt" + data_filedir: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/train/" + data_filename: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/labels/train.txt" + label_filename: "" validation_percent: 0.0 + absolute_sample_count: 0 percent_of_data_to_use: 1.0 num_labels: 1000 @@ -34,8 +36,11 @@ data_reader { reader { name: "imagenet" role: "validate" - data_filedir: "path/to/ILSVRC2012/val" - data_filename: "path/to/ILSVRC2012/labels/val.txt" + shuffle: true + data_filedir: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/val/" + data_filename: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/labels/val.txt" + label_filename: "" + absolute_sample_count: 0 percent_of_data_to_use: 1.0 num_labels: 1000 diff --git 
a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh index 1ecdc393b57..333b99b3f41 100755 --- a/bamboo/compiler_tests/build_script.sh +++ b/bamboo/compiler_tests/build_script.sh @@ -23,7 +23,7 @@ MPI_DIR=${COMPILER_DIR}/${MPI_LIBRARY} # most are MPI-independent). DEPENDENCY_DIR=${MPI_DIR} -export CMAKE_PREFIX_PATH=${COMMON_DEPENDENCY_DIR}/catch2:${COMMON_DEPENDENCY_DIR}/cereal:${COMMON_DEPENDENCY_DIR}/clara:${COMMON_DEPENDENCY_DIR}/cub:${COMMON_DEPENDENCY_DIR}/half:${DEPENDENCY_DIR}/aluminum:${DEPENDENCY_DIR}/cnpy:${DEPENDENCY_DIR}/conduit:${DEPENDENCY_DIR}/hdf5:${DEPENDENCY_DIR}/hydrogen:${DEPENDENCY_DIR}/jpeg-turbo:${DEPENDENCY_DIR}/nccl:${DEPENDENCY_DIR}/openblas:${DEPENDENCY_DIR}/opencv:${DEPENDENCY_DIR}/protobuf:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${COMMON_DEPENDENCY_DIR}/catch2:${COMMON_DEPENDENCY_DIR}/cereal:${COMMON_DEPENDENCY_DIR}/clara:${COMMON_DEPENDENCY_DIR}/cub:${COMMON_DEPENDENCY_DIR}/half:${DEPENDENCY_DIR}/aluminum-0.4.0:${DEPENDENCY_DIR}/cnpy:${DEPENDENCY_DIR}/conduit:${DEPENDENCY_DIR}/hdf5:${DEPENDENCY_DIR}/hydrogen-1.4.0:${DEPENDENCY_DIR}/jpeg-turbo:${DEPENDENCY_DIR}/nccl:${DEPENDENCY_DIR}/openblas:${DEPENDENCY_DIR}/opencv:${DEPENDENCY_DIR}/protobuf:${CMAKE_PREFIX_PATH} if [ -e ${DEPENDENCY_DIR} ]; then diff --git a/bamboo/integration_tests/test_integration_alexnet.py b/bamboo/integration_tests/test_integration_alexnet.py index 576b2852204..d54954240e2 100644 --- a/bamboo/integration_tests/test_integration_alexnet.py +++ b/bamboo/integration_tests/test_integration_alexnet.py @@ -186,5 +186,6 @@ def func(cluster, exes, dirname, weekly): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - nodes=num_nodes): + nodes=num_nodes, + lbann_args=['--load_full_sample_list_once']): globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_resnet50.py b/bamboo/integration_tests/test_integration_resnet50.py index 360e3fb20e1..77e629bf4d9 100644 --- a/bamboo/integration_tests/test_integration_resnet50.py +++ b/bamboo/integration_tests/test_integration_resnet50.py @@ -184,5 +184,6 @@ def func(cluster, exes, dirname, weekly): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - nodes=num_nodes): + nodes=num_nodes, + lbann_args=['--load_full_sample_list_once']): globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/unit_tests/test_unit_layer_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_batch_normalization.py new file mode 100644 index 00000000000..cbc3de3230d --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_batch_normalization.py @@ -0,0 +1,179 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
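+#
+# For orientation, the protocol the reader expects from this module is just
+# the three module-level functions defined below (a sketch of the contract,
+# not the reader's exact internal call pattern):
+#   get_sample(index) -> 1-D array of length sample_dims()[0]
+#   num_samples()     -> int, number of samples in the dataset
+#   sample_dims()     -> tuple holding the flattened sample shape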
+ +# Data +np.random.seed(20200827) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: We want to use gradient checking to verify that error + # signals are correct. To do this, we zero-initialize a weights + # object, construct a zero-valued tensor, and add it to the + # input. To make sure that batchnorm is non-trivial, we multiply + # the zero-valued tensor by the mini-batch index. + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Divide(lbann.MiniBatchIndex(), lbann.MiniBatchSize()) + x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x, lbann.Multiply(x0, x1)) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Local statistics + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.BatchNormalization(x, + decay=decay, + epsilon=epsilon, + scale_init=1.5, + bias_init=0.25, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='local statistics')) + + # ------------------------------------------ + # Global statistics + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.BatchNormalization(x, + decay=decay, + epsilon=epsilon, + scale_init=0.8, + bias_init=-0.25, + statistics_group_size=-1, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='global statistics')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 1 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. 
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Note: The training data reader should be removed when
+    # https://github.com/LLNL/lbann/issues/1098 is resolved.
+    message = lbann.reader_pb2.DataReader()
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'train'
+        )
+    ])
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'test'
+        )
+    ])
+    return message
+
+# ==============================================
+# Setup PyTest
+# ==============================================
+
+# Create test functions that can interact with PyTest
+for test in tools.create_tests(setup_experiment, __file__):
+    globals()[test.__name__] = test
diff --git a/bamboo/unit_tests/test_unit_layer_gru.py b/bamboo/unit_tests/test_unit_layer_gru.py
new file mode 100644
index 00000000000..81c35f5a664
--- /dev/null
+++ b/bamboo/unit_tests/test_unit_layer_gru.py
@@ -0,0 +1,236 @@
+import functools
+import operator
+import os
+import os.path
+import sys
+import numpy as np
+import scipy.special
+
+# Bamboo utilities
+current_file = os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
+import tools
+
+# ==============================================
+# Objects for Python data reader
+# ==============================================
+# Note: The Python data reader imports this file as a module and calls
+# the functions below to ingest data.
+
+# Data
+np.random.seed(20200909)
+_num_samples = 15
+_sequence_length = 5
+_input_size = 13
+_hidden_size = 7
+_sample_size = _sequence_length*_input_size + _hidden_size
+_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index,:]
+def num_samples():
+    return _num_samples
+def sample_dims():
+    return (_sample_size,)
+
+# ==============================================
+# NumPy implementation
+# ==============================================
+
+def numpy_gru(x, h, ih_matrix, hh_matrix, ih_bias, hh_bias):
+
+    # Cast inputs to float64
+    if x.dtype != np.float64:
+        x = x.astype(np.float64)
+    if h.dtype != np.float64:
+        h = h.astype(np.float64)
+    if ih_matrix.dtype != np.float64:
+        ih_matrix = ih_matrix.astype(np.float64)
+    if hh_matrix.dtype != np.float64:
+        hh_matrix = hh_matrix.astype(np.float64)
+    if ih_bias.dtype != np.float64:
+        ih_bias = ih_bias.astype(np.float64)
+    if hh_bias.dtype != np.float64:
+        hh_bias = hh_bias.astype(np.float64)
+
+    # Dimensions
+    sequence_length, input_size = x.shape
+    hidden_size = h.shape[0]
+
+    # Unroll GRU
+    y = []
+    for t in range(sequence_length):
+        ih = np.matmul(ih_matrix, x[t]) + ih_bias
+        hh = np.matmul(hh_matrix, h) + hh_bias
+        r = scipy.special.expit(ih[:hidden_size] + hh[:hidden_size])
+        z = scipy.special.expit(ih[hidden_size:2*hidden_size] + hh[hidden_size:2*hidden_size])
+        n = np.tanh(ih[2*hidden_size:] + r*hh[2*hidden_size:])
+        h = (1-z)*n + z*h
+        y.append(h)
+    return np.stack(y)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    mini_batch_size = num_samples() // 2
+    trainer = lbann.Trainer(mini_batch_size)
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.SGD()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with a weights layer so that gradient checking will
+    # verify that error signals are correct.
+    x_weights = lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
+                              name='input')
+    h_weights = lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
+                              name='initial_hidden')
+    input_ = lbann.Identity(lbann.Input())
+    input_slice = lbann.Slice(
+        input_,
+        slice_points=tools.str_list([0, _sequence_length*_input_size, _sample_size]),
+    )
+    x = lbann.Reshape(input_slice, dims=tools.str_list([_sequence_length,_input_size]))
+    x = lbann.Sum(x, lbann.WeightsLayer(weights=x_weights, hint_layer=x))
+    h = lbann.Reshape(input_slice, dims=tools.str_list([_hidden_size]),)
+    h = lbann.Sum(h, lbann.WeightsLayer(weights=h_weights, hint_layer=h))
+    x_lbann = x
+    h_lbann = h
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # 1-layer, uni-directional GRU
+    # ------------------------------------------
+
+    # Weights
+    ih_matrix = np.random.normal(size=(3*_hidden_size,_input_size)).astype(np.float32)
+    hh_matrix = np.random.normal(size=(3*_hidden_size,_hidden_size)).astype(np.float32)
+    ih_bias = np.random.normal(size=(3*_hidden_size,)).astype(np.float32)
+    hh_bias = np.random.normal(size=(3*_hidden_size,)).astype(np.float32)
+    ih_matrix_weights = lbann.Weights(
+        initializer=lbann.ValueInitializer(
+            values=tools.str_list(np.nditer(ih_matrix, order='F'))))
+    hh_matrix_weights = lbann.Weights(
+        initializer=lbann.ValueInitializer(
+            values=tools.str_list(np.nditer(hh_matrix, order='F'))))
+    ih_bias_weights = lbann.Weights(
+        initializer=lbann.ValueInitializer(
+            values=tools.str_list(np.nditer(ih_bias))))
+    hh_bias_weights = lbann.Weights(
+        initializer=lbann.ValueInitializer(
+            values=tools.str_list(np.nditer(hh_bias))))
+
+    # LBANN implementation
+    x = x_lbann
+    h = h_lbann
+    y = lbann.GRU(
+        x,
+        h,
+        hidden_size=_hidden_size,
+        weights=[ih_matrix_weights,hh_matrix_weights,ih_bias_weights,hh_bias_weights],
+    )
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='1-layer, unidirectional'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        input_ = get_sample(i).astype(np.float64)
+        x = input_[:_sequence_length*_input_size].reshape((_sequence_length,_input_size))
+        h = input_[_sequence_length*_input_size:]
+        y = numpy_gru(x, h, ih_matrix, hh_matrix, ih_bias, hh_bias)
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    num_epochs = 0
+    return lbann.Model(num_epochs,
+                       layers=lbann.traverse_layer_graph(x_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+
+    The Python data reader will import the current Python file to
+    access the sample access functions.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Note: The training data reader should be removed when
+    # https://github.com/LLNL/lbann/issues/1098 is resolved.
+    message = lbann.reader_pb2.DataReader()
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'train'
+        )
+    ])
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'test'
+        )
+    ])
+    return message
+
+# ==============================================
+# Setup PyTest
+# ==============================================
+
+# Create test functions that can interact with PyTest
+# for test in tools.create_tests(setup_experiment, __file__):
+#     globals()[test.__name__] = test
diff --git a/cmake/modules/FindPython.cmake b/cmake/modules/FindPython.cmake
index 39d5430461e..f30534e61c2 100644
--- a/cmake/modules/FindPython.cmake
+++ b/cmake/modules/FindPython.cmake
@@ -83,7 +83,7 @@ include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(
   Python
   REQUIRED_VARS Python_EXECUTABLE Python_INCLUDE_DIRS Python_LIBRARIES
-  Python_VERSION_MAJOR Python_VERSION_MINOR Python_VERSION_PATCH
+  Python_VERSION
   VERSION_VAR Python_VERSION)
 
 # Build the imported target
diff --git a/docs/build_llnl_idiosyncracies.rst b/docs/build_llnl_idiosyncracies.rst
index f5f3836ef0f..c94b4d7792d 100644
--- a/docs/build_llnl_idiosyncracies.rst
+++ b/docs/build_llnl_idiosyncracies.rst
@@ -24,7 +24,7 @@ this guide.
 Pre-installed Binary Packages
 ------------------------------
-
+.. comment: need to update this section with newer versions, or remove it if this method is no longer used by developers.
 The LC machines have many instances of cuDNN and NCCL installed in
 locations shared by the :code:`brain` group. These may be consistently
 detected by CMake by :code:`export`-ing their locations into the
diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst
index c3053300fd4..7a29790af17 100644
--- a/docs/building_lbann.rst
+++ b/docs/building_lbann.rst
@@ -137,8 +137,9 @@ CMake flags known to LBANN's "Superbuild" build system.
    :code:`-e` flag. A full list of options can be viewed with the
    :code:`-h` flag.
 
-2. Setup the LBANN CMake environment using the Spack environment for
-   the dependencies.
+2. Setup the LBANN CMake environment using the Spack environment for the
+   dependencies. If you used a custom Spack environment name in the step
+   above, be sure to specify that with the :code:`-e` option:
 
    .. code-block:: bash
diff --git a/docs/callbacks.rst b/docs/callbacks.rst
new file mode 100644
index 00000000000..bfb490ebc2b
--- /dev/null
+++ b/docs/callbacks.rst
@@ -0,0 +1,123 @@
+.. role:: python(code)
+   :language: python
+
+.. _callbacks:
+
+============================================================
+Callbacks
+============================================================
+
+LBANN has numerous callbacks that can be used to collect
+data about an experiment, such as scalars, metrics, weights,
+memory usage, images, etc. The documentation of many of these
+is pending; see the :ref:`list of Available
+Callbacks <available-callbacks>` for the documented ones.
+
+The callbacks are set to execute at various times, and some can be
+used to display images according to either a boolean output or
+their global sample index.
+
+For a complete listing of callbacks and details about their
+functionality, please see :ref:`Available
+Callbacks <available-callbacks>`.
+
+.. _using-callbacks:
+
+------------------------------------------------
+Using Callbacks
+------------------------------------------------
+
+Callbacks are used by constructing them in the Python front end with
+the appropriate arguments and passing them as a list into the model.
+For example, the timer, print-statistics, and save-model callbacks
+could be included as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Python Front End
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+   timer = lbann.CallbackTimer()
+   print_stats = lbann.CallbackPrintStatistics(
+       batch_interval=5)
+   save_model = lbann.CallbackSaveModel(
+       dir=".",
+       disable_save_after_training=True)
+
+   callbacks = [timer,
+                print_stats,
+                save_model]
+
+   model = lbann.Model(num_epochs,
+                       layers,
+                       objective_function,
+                       metrics,
+                       callbacks)
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Protobuf (Advanced)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block::
+
+   callback {
+     timer {
+     }
+     print_statistics {
+       batch_interval: 5
+     }
+     save_model {
+       dir: "."
+       disable_save_after_training: true
+     }
+   }
+
+.. _available-callbacks:
+
+------------------------------------------------
+Available Callbacks
+------------------------------------------------
+
+.. toctree::
+   :maxdepth: 1
+
+   Check dataset
+   Check gradients
+   Check init
+   Check metric
+   Check nan in activation values
+   Check matrices in small values
+   Checkpoint
+   Confusion matrix
+   Debug
+   Debug io
+   Dump error signals
+   Dump gradients
+   Dump minibatch sample indices
+   Dump outputs
+   Dump weights
+   Early stopping
+   Gpu memory usage
+   Hang
+   Imcomm
+   Learning rate
+   Load model
+   Ltfb
+   Mixup
+   Monitor io
+   Perturb adam
+   Perturb dropout
+   Print model description
+   Print statistics
+   Profiler
+   Replace weights
+   Save images
+   Save model
+   Save topk models
+   Summarize images
+   Summary
+   Sync layers
+   Timeline
+   Timer
+   Variable minibatch
diff --git a/docs/callbacks/selection_strategy/categorical_accuracy_strategy.rst b/docs/callbacks/selection_strategy/categorical_accuracy_strategy.rst
new file mode 100644
index 00000000000..f575997d991
--- /dev/null
+++ b/docs/callbacks/selection_strategy/categorical_accuracy_strategy.rst
@@ -0,0 +1,59 @@
+.. role:: python(code)
+   :language: python
+
+==============================
+Categorical Accuracy Strategy
+==============================
+
+----------
+Summary
+----------
+
+The :python:`CategoricalAccuracyStrategy` is used to view a snapshot
+of images in the dataset being used in the training session that match
+a boolean criterion. To simplify model construction,
+this strategy can print images whose output is :python:`true`, images
+whose output is :python:`false`, or all images. A canonical use-case
+is to print the images that are (in)correctly categorized by a
+classification model. Images are output until a user-provided limit
+is reached or no more matches are found.
+
+.. note:: The name of this class erroneously suggests a rather narrow
+          use-case. We are looking to change the name in a future
+          release of LBANN. In fact, this strategy can take as input
+          any boolean layer, not just categorical accuracy layers.
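+
+For example, a minimal sketch of wiring this strategy into
+:python:`CallbackSummarizeImages` (the layer names here are
+illustrative, and the argument spellings follow the Arguments section
+below):
+
+.. code-block:: python
+
+   # Boolean layer that drives the image selection
+   accuracy = lbann.CategoricalAccuracy(prediction_scores, labels)
+
+   # Output up to 10 misclassified images per epoch
+   img_strategy = lbann.CategoricalAccuracyStrategy(
+       categorical_accuracy_layer_name=accuracy.name,
+       match_type=lbann.CategoricalAccuracyStrategy.MatchType.NOMATCH,
+       num_images_per_epoch=10)
+
+   summarize_images = lbann.CallbackSummarizeImages(
+       selection_strategy=img_strategy,
+       image_source_layer_name="images",
+       epoch_interval=5)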
+
+----------
+Arguments
+----------
+
++ :python:`categorical_accuracy_layer_name` (string): The name of the
+  boolean layer to be used to determine matches. A Python Front-End
+  layer's name can be accessed via the :python:`name` attribute. A
+  common use-case is the name of a :python:`CategoricalAccuracy` layer
+  that has been added to a model.
+
++ :python:`match_type`
+  (:python:`lbann.CategoricalAccuracyStrategy.MatchType`): Criterion for
+  selecting images to output. Possible values are:
+
+  ================= =======================================================
+  :python:`NOMATCH` Output images corresponding to :python:`false` values.
+  :python:`MATCH`   Output images corresponding to :python:`true` values.
+  :python:`ALL`     Output all images.
+  ================= =======================================================
+
+  The default value is :python:`NOMATCH`.
+
++ :python:`num_images_per_epoch` (uint): The maximum number of images to
+  output per epoch. The default value is 10.
+
+----------
+Usage
+----------
+
+See the :ref:`usage example <cat_acc_strategy_example>` as part of
+the :doc:`CallbackSummarizeImages <../summarize_images>`
+documentation.
+
diff --git a/docs/callbacks/selection_strategy/track_sample_ids_strategy.rst b/docs/callbacks/selection_strategy/track_sample_ids_strategy.rst
new file mode 100644
index 00000000000..252c16fc379
--- /dev/null
+++ b/docs/callbacks/selection_strategy/track_sample_ids_strategy.rst
@@ -0,0 +1,41 @@
+.. role:: python(code)
+   :language: python
+
+==============================
+Track Sample IDs Strategy
+==============================
+
+----------
+Summary
+----------
+
+The :python:`TrackSampleIDsStrategy` selection strategy is used by
+:python:`CallbackSummarizeImages` to output a constant set of images
+over the duration of a training run of LBANN. This strategy is
+ideally suited to generative applications, as it allows users to
+visualize the ability of a network to reproduce the same image over
+time.
+
+----------
+Arguments
+----------
+
++ :python:`input_layer_name`: The name of the input layer with the
+  original images. For reasons inherent to the C++ code, this must be
+  an :python:`Input` layer. A Python Front-End layer's name can be
+  accessed via the :python:`name` attribute.
+
++ :python:`num_tracked_images`: The number of images to track. If
+  unset, 10 images will be tracked. This is a proxy for the user
+  specifying images to track based on some unique identifier. We are
+  considering methods to expose this functionality; this is work in
+  progress.
+
+----------
+Usage
+----------
+
+See the :ref:`usage example <sample_id_strategy_example>` as part of
+the :doc:`CallbackSummarizeImages <../summarize_images>`
+documentation.
+
diff --git a/docs/callbacks/summarize_images.rst b/docs/callbacks/summarize_images.rst
new file mode 100644
index 00000000000..5b473175253
--- /dev/null
+++ b/docs/callbacks/summarize_images.rst
@@ -0,0 +1,180 @@
+.. role:: python(code)
+   :language: python
+
+.. role:: c(code)
+   :language: c
+
+.. _summarize-images-callback:
+
+============================================================
+Summarize Images Callback
+============================================================
+
+The purpose of this callback is to output images into an event file at
+the end of each epoch, according to the specified intervals. The
+images in the event file are displayed using `TensorBoard
+<https://www.tensorflow.org/tensorboard>`_. This callback could be
+used, for example, to display categorized images or images generated
+by an autoencoder or by a GAN.
+
+The method of selecting images, and the layers from which images are
+displayed, can be controlled via the :python:`selection_strategy`
+argument to the callback. Images that match some boolean value may be
+selected with :python:`CategoricalAccuracyStrategy`. A canonical
+example of this would be to output images that are classified
+incorrectly by a classification network. Alternatively, a fixed number
+of images can be displayed using
+:python:`TrackSampleIDsStrategy`. This may be used, for example, to
+visualize the progress in training a GAN or an autoencoder.
+
+---------------------------------------------
+Execution Points
+---------------------------------------------
+
++ After each testing/validation minibatch
+
+---------------------------------------------
+Callback Arguments (Python Front-End)
+---------------------------------------------
+
++ :python:`selection_strategy`: The image selection
+  strategy. Currently supported options are:
+
+  - :doc:`TrackSampleIDsStrategy <selection_strategy/track_sample_ids_strategy>`
+  - :doc:`CategoricalAccuracyStrategy <selection_strategy/categorical_accuracy_strategy>`
+
++ :python:`image_source_layer_name`: The name of the layer from which
+  images will be pulled. A Python Front-End layer's name can be
+  accessed via the :python:`name` attribute. This may be the input
+  layer, if the true image is requested, or it may be any layer that
+  outputs a valid image tensor. This means it must be either
+  a 2-D tensor (greyscale image) or a 3-D tensor with the channel
+  dimension equal to 1 or 3 (greyscale or RGB, respectively).
+
++ :python:`epoch_interval`: Epoch frequency to output images. The
+  default value is 1; that is, perform the output every epoch.
+
+
+---------------------------------------------
+Examples Using Summarize Images Callback
+---------------------------------------------
+
+Python Front-End
+--------------------
+
+.. _sample_id_strategy_example:
+
+Track Sample IDs Strategy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note:: There is currently no built-in way to print the
+          original images using a single callback instance. As a
+          work-around, if the original image is desired, add a
+          second instance of the :python:`CallbackSummarizeImages`
+          with the :python:`image_source_layer_name` field set to
+          the input layer's name and the :python:`epoch_interval`
+          field set to be larger than the total number of epochs you
+          expect to run (so it will only output from epoch 0 and
+          never again).
+
+.. code-block:: python
+
+   # Set up image selection strategy
+   img_strategy = lbann.TrackSampleIDsStrategy(
+       input_layer_name="input",
+       num_tracked_images=10)
+
+   # Pass parameters to callback
+   summarize_images = lbann.CallbackSummarizeImages(
+       selection_strategy=img_strategy,
+       image_source_layer_name="reconstruction",
+       epoch_interval=5)
+
+   # Optional: output the original image from the input layer once,
+   # using a high epoch interval
+   summarize_input_layer = lbann.CallbackSummarizeImages(
+       selection_strategy=img_strategy,
+       image_source_layer_name="input",
+       epoch_interval=10000)
+
+.. _cat_acc_strategy_example:
+
+Categorical Accuracy Strategy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+   # Set up categorical accuracy layer
+   accuracy = lbann.CategoricalAccuracy(prediction_scores, labels)
+
+   # Set up image selection criteria
+   match_type = lbann.CategoricalAccuracyStrategy.MatchType
+
+   # Set up image selection strategy
+   img_strategy = lbann.CategoricalAccuracyStrategy(
+       cat_accuracy_layer_name=accuracy.name,
+       match_type=match_type.NOMATCH,
+       num_images=10)
+
+   # Pass parameters to callback
+   summarize_images = lbann.CallbackSummarizeImages(
+       selection_strategy=img_strategy,
+       image_source_layer_name=images.name,
+       epoch_interval=5)
+
+
+Prototext (Advanced)
+----------------------
+
+Track Sample IDs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: guess
+
+   callback {
+     summarize_images {
+       selection_strategy {
+         track_sample_ids {
+           input_layer_name: "input"
+           num_tracked_images: 10
+         }
+       }
+       image_source_layer_name: "reconstruction"
+       epoch_interval: 1
+     }
+   }
+
+
+Categorical Accuracy Strategy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: guess
+
+   # Set up categorical accuracy layer
+   layer {
+     parents: "prob label"
+     name: "accuracy"
+     data_layout: "data_parallel"
+     categorical_accuracy {}
+   }
+
+   # Set up callback
+   callback {
+     summarize_images {
+       selection_strategy {
+         categorical_accuracy {
+           cat_accuracy_layer_name: "accuracy"
+           num_images: 10
+         }
+       }
+       image_source_layer_name: "images"
+       epoch_interval: 1
+       img_format: ".jpg"
+     }
+   }
+
+.. toctree::
+   :hidden:
+
+   selection_strategy/categorical_accuracy_strategy.rst
+   selection_strategy/track_sample_ids_strategy.rst
diff --git a/docs/index.rst b/docs/index.rst
index e4603712d64..40536b6f2a6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -29,8 +29,10 @@ Users are advised to view `the Doxygen API Documentation
    :maxdepth: 2
    :caption: Getting Started
 
+   quick_start
    building_lbann
    running_lbann
+   callbacks
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/quick_start.rst b/docs/quick_start.rst
new file mode 100644
index 00000000000..26a0693470b
--- /dev/null
+++ b/docs/quick_start.rst
@@ -0,0 +1,368 @@
+.. role:: bash(code)
+   :language: bash
+
+====================
+Quick Start
+====================
+
+--------------------
+What can LBANN do?
+--------------------
+
+The Livermore Big Artificial Neural Network toolkit (LBANN) is an HPC-centric
+deep learning training framework that works across multiple levels of
+parallelism. LBANN is capable of taking advantage of HPC hardware to
+accelerate the training of deep learning models on massive datasets.
+
+
+--------------------
+Installing LBANN
+--------------------
+
+LBANN supports installation through Spack and CMake. We recommend using the
+Spack installation instructions below. If the Spack install fails, try using
+the :ref:`CMake install `.
+
+1. Download and install `Spack <https://github.com/spack/spack>`_. Enable the
+   additional Spack commands for module files described `here
+   <https://spack.readthedocs.io/en/latest/module_file_support.html>`_:
+
+   .. code-block:: bash
+
+      source ${SPACK_ROOT}/share/spack/setup-env.sh
+
+2. Users that are `familiar with Spack
+   `_
+   and already have a `custom Spack ecosystem
+   `_ can install
+   LBANN with:
+
+   .. code-block:: bash
+
+      spack install lbann
+
+   A complete list of LBANN install options can be found with:
+
+   .. code-block:: bash
+
+      spack info lbann
+
+   For users new to Spack, LBANN provides a script that will perform some
+   basic configuration (e.g., add paths to externally installed packages) and
+   install LBANN in a Spack environment. *This script is only tested and
+   maintained for systems at LLNL, NERSC, and ORNL.
If you are not running on + a system at one of these institutions, you may try the Spack install above + or the :ref:`CMake install `.* To use this installation + script, clone the repository and run the script: + + .. code-block:: bash + + git clone https://github.com/llnl/lbann + cd ./lbann + ./scripts/install_lbann.sh -e lbann + + View other options available by passing the :code:`-h` option to the + script. + +.. note:: It is recommended that your Spack environment take advantage + of locally installed tools. Unless your Spack environment + is explicitly told about tools such as CMake, Python, MPI, + etc., it will install everything that LBANN and all of its + dependencies require. This can take quite a long time but + only has to be done once for a given spack repository. Once + all of the standard tools are installed, rebuilding LBANN + with Spack is quite fast. + + Advice on setting up paths to external installations is + beyond the scope of this document but is covered in the + `Spack Documentation + `_. + + +-------------------- +Test LBANN Install +-------------------- + +1. If you used the :code:`install_lbann.sh` script for installation or + installed in a Spack environment, you will need to activate the Spack LBANN + environment: + + .. code-block:: bash + + spack env activate -p lbann + +2. Test an implementation of the `LeNet neural network + `_ on the `MNIST data set + `_ at :code:`/applications/vision/lenet.py` to verify that your LBANN installation + is working correctly: + + .. code-block:: bash + + cd /applications/vision/ + python3 lenet.py + + Running this Python script will automatically submit a job to the system + scheduler. If LBANN was built successfully, you should see output from + LBANN about loading the data, building the network, and training the model. + + If LBANN fails to run, you can view the generated job script and log files, + and run the job manually: + + .. code-block:: bash + + ls ./\*_lbann_lenet + + If this also fails, you may try building LBANN again using the :ref:`CMake + install instructions `. + + +-------------------- +Basic Usage +-------------------- + +A typical workflow involves the following steps: + +1. Configuring a :python:`Trainer`. + +2. Configuring LBANN model components (like the graph of + :python:`Layer` s) and creating a :python:`Model`. + + + Classes for model components are automatically generated from the + LBANN Protobuf specifications in `lbann/src/proto + `_. These + files are currently the best source of documentation. Message + fields in the Protobuf specification are optional keyword + arguments for the corresponding Python class constructor. If a + keyword argument is not provided, it is logically zero (e.g. false + for Boolean fields and empty for string fields) + +3. Configuring the default :python:`Optimizer` to be used by the + :python:`Weights` objects. + +4. Loading in a Protobuf text file describing the data reader. + + + The Python frontend currently does not have good support for + specifying data readers. If any data reader properties need to be + set programmatically, the user must do it directly via the + Protobuf Python API. + +5. Launching LBANN by calling :python:`run`. + + + :python:`lbann.run` should be run from a compute node. If a node + allocation is not available, the :python:`batch_job` option can + be set to submit a batch job to the scheduler. + + + A timestamped work directory will be created each time LBANN is + run. 
The default location of these work directories can be set
+     with the environment variable :bash:`LBANN_EXPERIMENT_DIR`.
+
+   + Supported job managers are Slurm and LSF.
+
+   + LLNL users and collaborators may prefer to use
+     :python:`lbann.contrib.launcher.run`. This is similar to
+     :python:`lbann.run`, with defaults and optimizations for certain
+     systems.
+
+
+--------------------
+PyTorch to LBANN
+--------------------
+
+The LBANN Python API is very similar to the PyTorch API. In order to help
+users familiar with PyTorch transition to LBANN, we prepared the following
+guide:
+
+~~~~~~~~~~~~~~~~~~~~
+Loading Data
+~~~~~~~~~~~~~~~~~~~~
+Both LBANN and PyTorch use similar strategies for loading data into models.
+With PyTorch, we can load the `MNIST dataset
+<http://yann.lecun.com/exdb/mnist/>`_ using the included
+:python:`DataLoader`:
+
+   .. code-block:: python
+
+      import torch
+      from torchvision import datasets, transforms
+
+      batch_size = 64
+      data_loader = torch.utils.data.DataLoader(
+          datasets.MNIST('data', train=True, download=True,
+                         transform=transforms.ToTensor()),
+          batch_size=batch_size)
+
+With LBANN, you can write custom data reader functions that use protobuf files
+to define the input data and transform it into the input tensors for your
+model:
+
+   .. code-block:: python
+
+      import os
+      import lbann
+      from google.protobuf import text_format
+
+      def make_data_reader(data_dir):
+          protobuf_file = os.path.join(data_dir, 'data_reader.prototext')
+          message = lbann.lbann_pb2.LbannPB()
+          with open(protobuf_file, 'r') as f:
+              text_format.Merge(f.read(), message)
+          message = message.data_reader
+          message.reader[0].data_filedir = data_dir
+
+          return message
+
+      data_reader = make_data_reader(os.path.realpath('./mnist_data/'))
+
+This reader assumes that the files `train-images-idx3-ubyte
+`_,
+`train-labels-idx1-ubyte
+`_, and
+:code:`data_reader.prototext` are located in the :bash:`./mnist_data`
+directory. The :code:`data_reader.prototext` file contains the following:
+
+   .. code-block:: protobuf
+
+      data_reader {
+        reader {
+          name: "mnist"
+          role: "train"
+          shuffle: true
+          data_filedir: "mnist_data"
+          data_filename: "train-images-idx3-ubyte"
+          label_filename: "train-labels-idx1-ubyte"
+          validation_percent: 0.1
+          percent_of_data_to_use: 1.0
+          transforms {
+            scale {
+              scale: 0.003921568627 # 1/255
+            }
+          }
+        }
+      }
+
+~~~~~~~~~~~~~~~~~~~~
+Building a Model
+~~~~~~~~~~~~~~~~~~~~
+
+Building models in LBANN is similar to building models in PyTorch.
+For example, we can define a simple PyTorch model for the MNIST dataset with:
+
+   .. code-block:: python
+
+      import torch.nn as nn
+      import torch.nn.functional as F
+
+      class Net(nn.Module):
+          def __init__(self):
+              super(Net, self).__init__()
+              self.conv = nn.Conv2d(1, 20, kernel_size=5)
+              self.fc = nn.Linear(12*12*20, 10)
+
+          def forward(self, x):
+              x = self.conv(x)
+              x = F.relu(x)
+              x = F.max_pool2d(x, 2)
+              x = x.view(x.size(0), -1)
+              x = self.fc(x)
+              x = F.log_softmax(x, dim=1)
+              return x
+
+      net = Net()
+
+
+Using LBANN, that same neural network can be built with:
+
+   .. code-block:: python
+
+      input_ = lbann.Input()
+      images = lbann.Identity(input_)
+      labels = lbann.Identity(input_)
+
+      x = lbann.Convolution(images, num_dims=2, num_output_channels=20,
+                            num_groups=1, conv_dims_i=5, conv_strides_i=1,
+                            conv_dilations_i=1, has_bias=True)
+      x = lbann.Relu(x)
+      x = lbann.Pooling(x, num_dims=2, pool_dims_i=2,
+                        pool_strides_i=2, pool_mode='max')
+      x = lbann.FullyConnected(x, num_neurons=10, has_bias=True)
+      probs = lbann.Softmax(x)
+
+      loss = lbann.CrossEntropy(probs, labels)
+
+      model = lbann.Model(epochs=5,
+                          layers=lbann.traverse_layer_graph(input_),
+                          objective_function=loss,
+                          callbacks=[lbann.CallbackPrintModelDescription(),
+                                     lbann.CallbackPrint()])
+
+~~~~~~~~~~~~~~~~~~~~
+Setup Model Training
+~~~~~~~~~~~~~~~~~~~~
+
+Training a model with PyTorch can be achieved by setting a few parameters,
+defining an optimizer, and building a training loop:
+
+   .. code-block:: python
+
+      import torch.optim as optim
+
+      learning_rate = 0.01
+      momentum = 0.5
+
+      opt = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
+
+      def train(epoch):
+          net.train()
+          for batch_idx, (data, target) in enumerate(data_loader):
+              opt.zero_grad()
+              output = net(data)
+              loss = F.nll_loss(output, target)
+              loss.backward()
+              opt.step()
+
+          print('Training Epoch: {},\tLoss: {:.3f}'.format(epoch, loss.item()))
+
+With LBANN, we also define learning parameters and an optimizer. LBANN
+provides a :python:`Trainer` that removes the need to build your own
+training loop:
+
+   .. code-block:: python
+
+      learning_rate = 0.01
+      momentum = 0.5
+      batch_size = 64
+
+      opt = lbann.SGD(learn_rate=learning_rate, momentum=momentum)
+
+      trainer = lbann.Trainer(mini_batch_size=batch_size)
+
+~~~~~~~~~~~~~~~~~~~~
+Run the Experiment
+~~~~~~~~~~~~~~~~~~~~
+
+Running the experiment in PyTorch is as simple as calling the training loop:
+
+   .. code-block:: python
+
+      for epoch in range(5):
+          train(epoch)
+
+Running the experiment in LBANN is just as easy:
+
+   .. code-block:: python
+
+      import lbann.contrib.launcher
+      lbann.contrib.launcher.run(trainer, model, data_reader,
+                                 opt, job_name='mnist-test')
+
+Python acts only as a frontend for LBANN. The above commands will
+automatically generate a batch job script and submit it to the system
+scheduler. You can see the job script and associated job files in the
+:bash:`./*mnist-test/` directory.
+
+.. note:: The LBANN :python:`launcher.run` can accept additional arguments to
+          specify additional scheduler and job parameters. LBANN provides
+          methods that help with these parameters at
+          :python:`lbann.contrib.args.add_scheduler_arguments()` and
+          :python:`lbann.contrib.args.get_scheduler_kwargs()`.
diff --git a/docs/running_lbann.rst b/docs/running_lbann.rst
index 124d9d28cbf..5223d55cad2 100644
--- a/docs/running_lbann.rst
+++ b/docs/running_lbann.rst
@@ -148,7 +148,9 @@ Model components
 
   + Callback: Function that is performed at various points during an
     experiment. Callbacks are helpful for reporting, debugging, and
-    performing advanced training techniques.
+    performing advanced training techniques. Please consult the
+    :ref:`Callbacks <callbacks>` documentation for detailed descriptions
+    of the callbacks.
 
     - This is the natural home for experimental training techniques.
diff --git a/include/lbann/data_coordinator/data_coordinator.hpp b/include/lbann/data_coordinator/data_coordinator.hpp index e4a9ae06c01..3f568b154bd 100644 --- a/include/lbann/data_coordinator/data_coordinator.hpp +++ b/include/lbann/data_coordinator/data_coordinator.hpp @@ -35,7 +35,9 @@ #include #include #include - +#ifdef LBANN_HAS_DISTCONV +#include "lbann/data_readers/data_reader_hdf5.hpp" +#endif // LBANN_HAS_DISTCONV namespace lbann { @@ -147,6 +149,7 @@ class data_coordinator { map[data_reader_target_mode::CLASSIFICATION] = std::vector(1, dr->get_num_labels()); map[data_reader_target_mode::REGRESSION] = std::vector(1, dr->get_num_responses()); map[data_reader_target_mode::RECONSTRUCTION] = dr->get_data_dims(); + map[data_reader_target_mode::LABEL_RECONSTRUCTION] = dr->get_data_dims(); map[data_reader_target_mode::NA] = std::vector(1, 0); return map; } @@ -182,6 +185,10 @@ class data_coordinator { DataReaderMetaData drm; drm.data_dims = get_data_dims(); drm.slice_points = get_slice_points(); +#ifdef LBANN_HAS_DISTCONV + const auto training_dr = m_data_readers[execution_mode::training]; + drm.shuffle_required = training_dr->is_tensor_shuffle_required(); +#endif // LBANN_HAS_DISTCONV return drm; } diff --git a/include/lbann/data_coordinator/data_coordinator_metadata.hpp b/include/lbann/data_coordinator/data_coordinator_metadata.hpp index d9c37f23527..8206a4a11a5 100644 --- a/include/lbann/data_coordinator/data_coordinator_metadata.hpp +++ b/include/lbann/data_coordinator/data_coordinator_metadata.hpp @@ -30,6 +30,7 @@ #include #include "lbann/utils/enum_iterator.hpp" +#include "lbann/utils/distconv.hpp" #include #include @@ -38,7 +39,7 @@ namespace lbann { // NA - Not applicable, used for input layers that don't produce a second output -enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, INPUT, NA}; +enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, LABEL_RECONSTRUCTION, INPUT, NA}; std::string to_string(data_reader_target_mode m); /// Map from target modes to dimension maps using TargetModeDimMap = std::unordered_map>; @@ -57,6 +58,12 @@ using slice_points_mode_iterator = enum_iterator #include #include @@ -91,7 +93,7 @@ class generic_data_reader { m_world_master_mini_batch_adjustment(0), m_num_parallel_readers(0), m_rank_in_model(0), m_max_files_to_load(0), - m_file_dir(""), m_data_index_list(""), m_data_fn(""), m_label_fn(""), + m_file_dir(""), m_data_sample_list(""), m_data_fn(""), m_label_fn(""), m_shuffle(shuffle), m_absolute_sample_count(0), m_validation_percent(0.0), m_use_percent(1.0), m_master(false), @@ -103,6 +105,7 @@ class generic_data_reader { m_procs_per_partition(1), m_io_thread_pool(nullptr), m_jag_partitioned(false), + m_keep_sample_order(false), m_trainer(nullptr), m_issue_warning(true) { @@ -165,16 +168,22 @@ class generic_data_reader { std::string get_local_file_dir() const; /** - * Set the index list for your data (images, etc). - * The index lists contains an enumeration of all samples in the + * Set the sample list for your data (images, etc). + * The sample lists contains an enumeration of all samples in the * data set. */ - void set_data_index_list(std::string s); + void set_data_sample_list(std::string s); + + /** + * Returns the complete sample list for your data set. + */ + std::string get_data_sample_list() const; /** - * Returns the complete index list for your data set. + * To facilictate the testing, maintain the order of loaded samples + * in the sample list as it is in the list file. 
*/ - std::string get_data_index_list() const; + void keep_sample_order(bool same_order = false); /** * Set the filename for your data (images, etc). @@ -596,9 +605,9 @@ class generic_data_reader { /// returns true if the data set is partitioned bool is_partitioned() const { return m_is_partitioned; } - /// Does the data reader have a unqiue index list per model + /// Does the data reader have a unqiue sample list per model virtual bool has_list_per_model() const { return false; } - /// Does the data reader have a unqiue index list per trainer + /// Does the data reader have a unqiue sample list per trainer virtual bool has_list_per_trainer() const { return false; } @@ -663,6 +672,14 @@ class generic_data_reader { m_transform_pipeline = std::move(tp); } +#ifdef LBANN_HAS_DISTCONV + /** + * Returns whether shuffle (which refers to input data shuffling for + * Distconv but not random sample shuffling) is required. + */ + virtual bool is_tensor_shuffle_required() const { return true; } +#endif // LBANN_HAS_DISTCONV + protected: bool m_verbose = false; @@ -782,7 +799,7 @@ class generic_data_reader { size_t m_max_files_to_load; std::string m_file_dir; std::string m_local_file_dir; - std::string m_data_index_list; + std::string m_data_sample_list; std::string m_data_fn; std::string m_label_fn; bool m_shuffle; @@ -855,6 +872,10 @@ class generic_data_reader { /// owns a unique subset of the data bool m_jag_partitioned; + /** Whether to keep the order of loaded samples same as it is in the + * file to make testing and validation easier */ + bool m_keep_sample_order; + /// called by fetch_data a single time if m_jag_partitioned = true; /// this sets various member variables (num_iterations, m_reset_mini_batch_index, /// etc. diff --git a/include/lbann/data_readers/data_reader_hdf5.hpp b/include/lbann/data_readers/data_reader_hdf5.hpp new file mode 100644 index 00000000000..1719d542256 --- /dev/null +++ b/include/lbann/data_readers/data_reader_hdf5.hpp @@ -0,0 +1,141 @@ +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// +#ifndef LBANN_DATA_READER_HDF5_HPP +#define LBANN_DATA_READER_HDF5_HPP +#include "data_reader_image.hpp" +#include "hdf5.h" +#include "conduit/conduit.hpp" + +namespace lbann { +/** + * Data reader for data stored in HDF5 files. This data reader was + * designed to work with Distconv. This currently has two different + * modes: + * * Datasets with 3D data and a few numbers of responses: + * This mode assumes a 3D cube dataset such as the CosmoFlow dataset. + * This requires set_has_responses to be called on setup. + * * Datasets with 3D data and 3D labels: + * This mode assumes 3D cubes with corresponding 3D label tensors + * such as the LiTS dataset. This requires set_has_labels to be + * called on setup, and label_reconstruction should be used for the + * input layer. 
+ * + * Each HDF5 file should contain hdf5_key_data, hdf5_key_labels, and + * hdf5_key_responses keys to read data, labels, and responses, + * respectively. + */ +template +class hdf5_reader : public generic_data_reader { + public: + hdf5_reader(const bool shuffle, + const std::string key_data, + const std::string key_label, + const std::string key_responses, + const bool hyperslab_labels); + hdf5_reader(const hdf5_reader&); + hdf5_reader& operator=(const hdf5_reader&); + ~hdf5_reader() override {} + + hdf5_reader* copy() const override { return new hdf5_reader(*this); } + + void copy_members(const hdf5_reader& rhs); + + std::string get_type() const override { + return "data_reader_hdf5_images"; + } + //void set_input_params(int width, int height, int depth, int num_ch, int num_labels); + void load() override; + void set_hdf5_paths(const std::vector hdf5_paths) {m_file_paths = hdf5_paths;} + + void set_has_labels(const bool b) { m_has_labels = b; } + void set_has_responses(const bool b) { m_has_responses = b; } + void set_num_responses(const size_t num_responses) { + m_all_responses.resize(num_responses); + } + + int get_num_labels() const override { + if(!m_has_labels) { + return generic_data_reader::get_num_labels(); + } + // This data reader currently assumes that the shape of the label + // tensor is the same as that of the data tensor. + return m_num_features; + } + int get_num_responses() const override { + if(!m_has_responses) { + return generic_data_reader::get_num_responses(); + } + return get_linearized_response_size(); + } + int get_linearized_data_size() const override { + return m_num_features; + } + int get_linearized_label_size() const override { + if(!m_has_labels) { + return generic_data_reader::get_linearized_label_size(); + } + // This data reader currently assumes that the shape of the label + // tensor is the same as that of the data tensor. + return m_num_features; + } + int get_linearized_response_size() const override { + if(!m_has_responses) { + return generic_data_reader::get_linearized_response_size(); + } + return m_all_responses.size(); + } + const std::vector get_data_dims() const override { + return m_data_dims; + } + +#ifdef LBANN_HAS_DISTCONV + bool is_tensor_shuffle_required() const override { return false; } +#endif // LBANN_HAS_DISTCONV + + protected: + void read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, int rank, + TensorDataType *sample); + void read_hdf5_sample(int data_id, TensorDataType *sample, TensorDataType *labels); + //void set_defaults() override; + bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; + void fetch_datum_conduit(Mat& X, int data_id); + bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; + void gather_responses(float *responses); + hid_t get_hdf5_data_type() const; + conduit::DataType get_conduit_data_type(conduit::index_t num_elements) const; + + /// Whether to fetch a label from the last column. + bool m_has_labels = false; + /// Whether to fetch a response from the last column.
+ bool m_has_responses = false; + int m_image_depth = 0; + size_t m_num_features; + std::vector m_all_responses; + std::vector m_file_paths; + MPI_Comm m_comm; + std::vector m_data_dims; + std::vector m_hyperslab_dims; + hid_t m_fapl; + hid_t m_dxpl; + MPI_Comm m_response_gather_comm; + bool m_use_data_store; + std::string m_key_data, m_key_labels, m_key_responses; + bool m_hyperslab_labels; + + private: + static const std::string HDF5_KEY_DATA, HDF5_KEY_LABELS, HDF5_KEY_RESPONSES; +}; +} +#endif // LBANN_DATA_READER_HDF5_HPP diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index cde595e781e..37c6f111e73 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -30,6 +30,7 @@ #define IMAGE_DATA_READER_HPP #include "data_reader.hpp" +#include "sample_list.hpp" #include "lbann/data_store/data_store_conduit.hpp" namespace lbann { @@ -38,6 +39,10 @@ class image_data_reader : public generic_data_reader { using img_src_t = std::string; using label_t = int; using sample_t = std::pair; + using sample_name_t = img_src_t; + using sample_list_t = sample_list; + using sample_idx_t = sample_list_t::sample_idx_t; + using labels_t = std::vector; image_data_reader(bool shuffle = true); image_data_reader(const image_data_reader&); @@ -78,21 +83,16 @@ class image_data_reader : public generic_data_reader { return {m_image_num_channels, m_image_height, m_image_width}; } - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - /// Allow read-only access to the entire sample list - const std::vector& get_image_list() const { - return m_image_list; + const sample_list_t& get_sample_list() const { + return m_sample_list; } /** * Returns idx-th sample in the initial loading order. * The second argument is only to facilitate overloading, and not to be used by users. */ - sample_t get_sample(const size_t idx) const { - return m_image_list.at(idx); - } + sample_t get_sample(const size_t idx) const; void do_preload_data_store() override; @@ -106,15 +106,33 @@ class image_data_reader : public generic_data_reader { bool fetch_label(Mat& Y, int data_id, int mb_idx) override; void set_linearized_image_size(); + /** Dump the image list file in which each line consists of the file name + * and the label of a sample */ + void dump_sample_label_list(const std::string& dump_file_name); + /// Rely on pre-determined list of samples. 
+ void load_list_of_samples(const std::string filename); + /// Load the sample list from a serialized archive from another rank + void load_list_of_samples_from_archive(const std::string& sample_list_archive); + /// Use the imagenet image list file, and generate sample list header on-the-fly + void gen_list_of_samples(); + /// Load the labels for samples + void load_labels(std::vector& preloaded_buffer); + /// Read the labels from an open input stream + void read_labels(std::istream& istrm); + /// Return the number of lines in the input stream + size_t determine_num_of_samples(std::istream& istrm) const; + std::string m_image_dir; ///< where images are stored - std::vector m_image_list; ///< list of image files and labels int m_image_width; ///< image width int m_image_height; ///< image height int m_image_num_channels; ///< number of image channels int m_image_linearized_size; ///< linearized image size int m_num_labels; ///< number of labels - bool load_conduit_nodes_from_file(const std::unordered_set &data_ids); + sample_list_t m_sample_list; + labels_t m_labels; + + bool load_conduit_nodes_from_file(const std::unordered_set &data_ids); }; diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index be53df9aced..558a766094d 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -169,9 +169,9 @@ class data_reader_jag_conduit : public generic_data_reader { void check_image_data(); #endif // _JAG_OFFLINE_TOOL_MODE_ - /// Set every reader instances in a trainer to have an independent index list + /// Set every reader instances in a trainer to have an independent sample list void set_list_per_trainer(bool flag) { m_list_per_trainer = flag; }; - /// Set every reader instances in a model to have an independent index list + /// Set every reader instances in a model to have an independent sample list void set_list_per_model(bool flag) { m_list_per_model = flag; }; bool has_list_per_model() const override { return m_list_per_model; } @@ -316,8 +316,10 @@ class data_reader_jag_conduit : public generic_data_reader { * the number of models and the mini batch size. */ bool check_num_parallel_readers(long data_set_size); + /// Check the consistency of the schema of the first sample + void sample_schema_check(const bool check_data); /// Rely on pre-determined list of samples. 
- void load_list_of_samples(const std::string filename, size_t stride=1, size_t offset=0); + void load_list_of_samples(const std::string filename); /// Load the sample list from a serialized archive from another rank void load_list_of_samples_from_archive(const std::string& sample_list_archive); diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp index 6d4aa5e051f..ad64dff14d0 100644 --- a/include/lbann/data_readers/sample_list.hpp +++ b/include/lbann/data_readers/sample_list.hpp @@ -18,28 +18,49 @@ namespace lbann { -static const std::string sample_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; -static const std::string sample_inclusion_list = "CONDUIT_HDF5_INCLUSION"; +static const std::string multi_sample_exclusion = "MULTI-SAMPLE_EXCLUSION"; +static const std::string multi_sample_inclusion = "MULTI-SAMPLE_INCLUSION"; +static const std::string single_sample = "SINGLE-SAMPLE"; struct sample_list_header { + /// Whether each data file includes multiple samples + bool m_is_multi_sample; + /// Whether to list the IDs of samples to exclude or to include bool m_is_exclusive; + /// Whether to read the header line for a label file + bool m_no_label_header; /// Number of included samples size_t m_included_sample_count; /// Number of excluded samples size_t m_excluded_sample_count; size_t m_num_files; + /// Data file directory std::string m_file_dir; - std::string m_sample_list_filename; + std::string m_sample_list_name; + std::string m_label_filename; sample_list_header(); + void set_sample_list_type(const std::string& line1); + void set_sample_count(const std::string& line2); + void set_data_file_dir(const std::string& line3); + void set_label_filename(const std::string& line4); + + bool is_multi_sample() const; bool is_exclusive() const; + bool use_label_header() const; size_t get_sample_count() const; size_t get_num_files() const; - const std::string& get_sample_list_filename() const; const std::string& get_file_dir() const; + const std::string& get_sample_list_name() const; + /// Save the filename or stream name of this sample list for debugging + void set_sample_list_name(const std::string& n); + const std::string& get_label_filename() const; template void serialize( Archive & ar ) { - ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); + ar(m_is_multi_sample, m_is_exclusive, m_no_label_header, + m_included_sample_count, m_excluded_sample_count, + m_num_files, m_file_dir, + m_sample_list_name, m_label_filename); } }; @@ -53,6 +74,10 @@ class sample_list { using sample_t = std::template pair; /// Type for the list of samples using samples_t = std::template vector< sample_t >; + /// Type for the index into the sample list + using sample_idx_t = typename samples_t::size_type; + /// Type for the map from sample name to the sample list index + using sample_map_t = std::unordered_map; /// Mapping of the file index to the filename using file_id_stats_v_t = std::vector< std::string >; @@ -64,11 +89,17 @@ class sample_list { void copy_members(const sample_list& rhs); - /// Load a sample list file - void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); + /// Load a sample list file using the given stride and offset on the sample sequence + void load(std::istream& istrm, size_t stride=1, size_t offset=0); - /// Load the header of a sample list file - sample_list_header load_header(const std::string& samplelist_file) const; + /** Load a sample list file using the 
stride as the number of processes per + * trainer and the offset as the current rank within the trainer if + * interleaving option is on. + */ + void load(const std::string& samplelist_file, const lbann_comm& comm, bool interleave); + void load(std::istream& istrm, const lbann_comm& comm, bool interleave); + /// Load sample list using the given header instead of reading it from the input stream + void load(const sample_list_header& header, std::istream& istrm, const lbann_comm& comm, bool interleave); /// Restore a sample list from a serialized string void load_from_string(const std::string& samplelist); @@ -103,6 +134,7 @@ class sample_list { virtual const std::string& get_samples_filename(sample_file_id_t id) const; const std::string& get_samples_dirname() const; + const std::string& get_label_filename() const; void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); void all_gather_archive_new(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); @@ -110,13 +142,33 @@ class sample_list { template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); virtual void all_gather_packed_lists(lbann_comm& comm); + /// Set to maintain the original sample order as listed in the file + void keep_sample_order(bool keep); + + /// Manually set the sample list name, which can be used for stream-based sources + void set_sample_list_name(const std::string& n); + + /// Set to check the existence of data file in the list + void set_data_file_check(); + /// Set not to check the existence of data file in the list + void unset_data_file_check(); + + /// Build map from sample names to indices for sample list + void build_sample_map_from_name_to_index(); + + /// Clear the map from sample names to indices + void clear_sample_map_from_name_to_index(); + + /// Return the index of the sample with the specified name + sample_idx_t get_sample_index(const sample_name_t& sn ); + protected: /// Reads a header line from the sample list given as a stream, and use the info string for error message - std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; + std::string read_header_line(std::istream& ifs, const std::string& listname, const std::string& info); /// Reads the header of a sample list - sample_list_header read_header(std::istream& istrm, const std::string& filename) const; + void read_header(std::istream& istrm); /// read the body of a sample list, which is the list of sample files, where each file contains a single sample. 
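/// When interleaving is enabled, the stride is the number of processes
/// per trainer and the offset is this rank's position within it, so
/// rank r keeps entries r, r+stride, r+2*stride, ... of the list body;
/// reorder() can undo the interleaving afterwards when the original
/// sample order is kept.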
virtual void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0); @@ -125,7 +177,7 @@ class sample_list { virtual void assign_samples_name(); /// Reads a sample list and populates the internal list - size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); + size_t get_samples_per_file(std::istream& istrm, size_t stride=1, size_t offset=0); /// Add the header info to the given string void write_header(std::string& sstr, size_t num_files) const; @@ -135,14 +187,29 @@ class sample_list { virtual void set_samples_filename(sample_file_id_t id, const std::string& filename); + /// Reorder the sample list to its initial order + virtual void reorder(); + protected: /// header info of sample list sample_list_header m_header; - private: + /// The stride used in loading sample list file + size_t m_stride; + + /// maintain the original sample order as listed in the file + bool m_keep_order; + + /// Whether to check the existence of data file + bool m_check_data_file; + /// List of all samples with a file identifier and sample name for each sample samples_t m_sample_list; + /// Map from sample name to the corresponding index into the sample list + sample_map_t m_map_name_to_idx; + + private: /// Maps sample's file id to file names, file descriptors, and use counts file_id_stats_v_t m_file_id_stats_map; diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp index 0f161bed61f..80719640467 100644 --- a/include/lbann/data_readers/sample_list_impl.hpp +++ b/include/lbann/data_readers/sample_list_impl.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -76,15 +77,81 @@ template<> inline std::string to_sample_name_t(const std::string& s //------------------------ inline sample_list_header::sample_list_header() - : m_is_exclusive(false), m_included_sample_count(0u), - m_excluded_sample_count(0u), m_num_files(0u), - m_file_dir("") { + : m_is_multi_sample(false), m_is_exclusive(false), m_no_label_header(false), + m_included_sample_count(0u), m_excluded_sample_count(0u), m_num_files(0u), + m_file_dir(""), m_sample_list_name(""), m_label_filename("") { +} + +inline void sample_list_header::set_sample_list_type(const std::string& line1) { + std::stringstream header1(line1); + std::string sample_list_type; + header1 >> sample_list_type; + + std::for_each(sample_list_type.begin(), sample_list_type.end(), + [](char& c){ c = std::toupper(c); }); + + m_is_multi_sample = false; + m_is_exclusive = false; + m_no_label_header = false; + + if (sample_list_type == single_sample) { + } else if (sample_list_type == multi_sample_inclusion) { + m_is_multi_sample = true; + m_is_exclusive = false; + } else if (sample_list_type == multi_sample_exclusion) { + m_is_multi_sample = true; + m_is_exclusive = true; + } else if (sample_list_type == "CONDUIT_HDF5_INCLUSION") { + // For backward compatibility + m_is_multi_sample = true; + m_is_exclusive = false; + m_no_label_header = true; // old format does not use a line for label file + } else if (sample_list_type == "CONDUIT_HDF5_EXCLUSION") { + // For backward compatibility + m_is_multi_sample = true; + m_is_exclusive = true; + m_no_label_header = true; + } else { + LBANN_ERROR("Unknown sample list type: ", sample_list_type); + } +} + +inline void sample_list_header::set_sample_count(const std::string& line2) { + std::stringstream header2(line2); + if (m_is_multi_sample) { + header2 >> m_included_sample_count; + header2 >> 
m_excluded_sample_count; + } + header2 >> m_num_files; + + if (!m_is_multi_sample) { + m_included_sample_count = m_num_files; + m_excluded_sample_count = 0ul; + } +} + +inline void sample_list_header::set_data_file_dir(const std::string& line3) { + std::stringstream header3(line3); + header3 >> m_file_dir; +} + +inline void sample_list_header::set_label_filename(const std::string& line4) { + std::stringstream header4(line4); + header4 >> m_label_filename; +} + +inline bool sample_list_header::is_multi_sample() const { + return m_is_multi_sample; } inline bool sample_list_header::is_exclusive() const { return m_is_exclusive; } +inline bool sample_list_header::use_label_header() const { + return !m_no_label_header; +} + inline size_t sample_list_header::get_sample_count() const { return m_included_sample_count; } @@ -93,20 +160,29 @@ inline size_t sample_list_header::get_num_files() const { return m_num_files; } -inline const std::string& sample_list_header::get_sample_list_filename() const { - return m_sample_list_filename; -} - inline const std::string& sample_list_header::get_file_dir() const { return m_file_dir; } +inline const std::string& sample_list_header::get_sample_list_name() const { + return m_sample_list_name; +} + +inline void sample_list_header::set_sample_list_name(const std::string& n) { + m_sample_list_name = n; +} + +inline const std::string& sample_list_header::get_label_filename() const { + return m_label_filename; +} + //------------------ // sample_list //------------------ template -inline sample_list::sample_list() { +inline sample_list::sample_list() +: m_stride(1ul), m_keep_order(true), m_check_data_file(false) { } template @@ -149,6 +225,9 @@ template inline void sample_list ::copy_members(const sample_list& rhs) { m_header = rhs.m_header; + m_stride = rhs.m_stride; + m_keep_order = rhs.m_keep_order; + m_check_data_file = rhs.m_check_data_file; m_sample_list = rhs.m_sample_list; /// Keep track of existing filenames @@ -157,25 +236,53 @@ ::copy_members(const sample_list& rhs) { template inline void sample_list -::load(const std::string& samplelist_file, +::load(std::istream& istrm, size_t stride, size_t offset) { - std::ifstream istr(samplelist_file); - get_samples_per_file(istr, samplelist_file, stride, offset); - istr.close(); + m_stride = stride; + get_samples_per_file(istrm, stride, offset); +} + +template +inline void sample_list +::load(const std::string& samplelist_file, + const lbann_comm& comm, + bool interleave) { + m_header.set_sample_list_name(samplelist_file); + std::ifstream istrm(samplelist_file); + load(istrm, comm, interleave); + istrm.close(); } template -inline sample_list_header sample_list -::load_header(const std::string& samplelist_file) const { - std::ifstream istr(samplelist_file); - return read_header(istr, samplelist_file); +inline void sample_list +::load(std::istream& istrm, + const lbann_comm& comm, + bool interleave) { + const size_t stride = interleave? comm.get_procs_per_trainer() : 1ul; + const size_t offset = interleave? comm.get_rank_in_trainer() : 0ul; + load(istrm, stride, offset); +} + +template +inline void sample_list +::load(const sample_list_header& header, + std::istream& istrm, + const lbann_comm& comm, + bool interleave) { + m_header = header; + const size_t stride = interleave? comm.get_procs_per_trainer() : 1ul; + const size_t offset = interleave? 
comm.get_rank_in_trainer() : 0ul; + + m_stride = stride; + read_sample_list(istrm, stride, offset); } template inline void sample_list ::load_from_string(const std::string& samplelist) { - std::istringstream istr(samplelist); - get_samples_per_file(istr, "", 1, 0); + m_header.set_sample_list_name(""); + std::istringstream istrm(samplelist); + load(istrm, 1ul, 0ul); } template @@ -199,11 +306,11 @@ ::empty() const { template inline std::string sample_list ::read_header_line(std::istream& istrm, - const std::string& filename, - const std::string& info) const { + const std::string& listname, + const std::string& info) { if (!istrm.good()) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info); + + " :: unable to read the header line of sample list " + listname + " for " + info); } std::string line; @@ -211,7 +318,7 @@ ::read_header_line(std::istream& istrm, if (line.empty()) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info + + " :: unable to read the header line of sample list " + listname + " for " + info + " -- the line was empty"); } return line; @@ -219,47 +326,27 @@ ::read_header_line(std::istream& istrm, template -inline sample_list_header sample_list -::read_header(std::istream& istrm, - const std::string& filename) const { - sample_list_header hdr; - - hdr.m_sample_list_filename = filename; - - std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); - std::stringstream header1(line1); - - std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); - std::stringstream header2(line2); - - std::string line3 = read_header_line(istrm, filename, "the data file directory"); - std::stringstream header3(line3); +inline void sample_list +::read_header(std::istream& istrm) { + const std::string listname = m_header.get_sample_list_name(); - std::string sample_list_type; - header1 >> sample_list_type; - std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); + std::string line1 = read_header_line(istrm, listname, "the exclusiveness\n"); + std::string line2 = read_header_line(istrm, listname, "the number of samples and the number of files\n"); + std::string line3 = read_header_line(istrm, listname, "the data file directory\n"); - const std::string type_exclusive = sample_exclusion_list; - size_t found = sample_list_type.find(type_exclusive); + m_header.set_sample_list_type(line1); + m_header.set_sample_count(line2); + m_header.set_data_file_dir(line3); - if (found != std::string::npos) { - hdr.m_is_exclusive = true; - } else { - hdr.m_is_exclusive = false; + if (m_header.use_label_header()) { + std::string line4 = read_header_line(istrm, listname, "the path to label/response file\n"); + m_header.set_label_filename(line4); } - header2 >> hdr.m_included_sample_count; - header2 >> hdr.m_excluded_sample_count; - header2 >> hdr.m_num_files; - - header3 >> hdr.m_file_dir; - - if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { - LBANN_ERROR(std::string{} + "file " + filename - + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); + if (m_header.get_file_dir().empty() || !check_if_dir_exists(m_header.get_file_dir())) { + LBANN_ERROR(std::string{} + "file " + listname + + " :: data root directory '" + m_header.get_file_dir() 
+ "' does not exist."); } - - return hdr; } @@ -293,9 +380,9 @@ ::read_sample_list(std::istream& istrm, const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; - if (filename.empty() || !check_if_file_exists(file_path)) { + if (filename.empty() || (m_check_data_file && !check_if_file_exists(file_path))) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: data file '" + filename + "' does not exist."); + + " :: data file '" + file_path + "' does not exist."); } const sample_file_id_t index = m_file_id_stats_map.size(); @@ -323,10 +410,10 @@ ::read_sample_list(std::istream& istrm, template inline size_t sample_list ::get_samples_per_file(std::istream& istrm, - const std::string& filename, size_t stride, size_t offset) { - m_header = read_header(istrm, filename); + read_header(istrm); + m_stride = stride; read_sample_list(istrm, stride, offset); return size(); @@ -412,7 +499,7 @@ ::all_gather_archive_new(const std::string &archive, for (auto t : packed_sizes) { g += t; } - if (!me) { + if (me == comm.get_trainer_master()) { std::cout << "global archive size: " << g << std::endl; } @@ -420,7 +507,7 @@ ::all_gather_archive_new(const std::string &archive, gathered_archive[p].resize(packed_sizes[p]); if (me == p) { gathered_archive[p] = archive; - } + } int sz = packed_sizes[p]; char *data = const_cast(gathered_archive[p].data()); comm.trainer_broadcast(p, data, sz); @@ -509,23 +596,34 @@ template void sample_list ::serialize( Archive & ar ) { ar(m_header, m_sample_list, m_file_id_stats_map); + // The member variables that are only meaningful during initial loading + // are not included here. + // e.g., m_stride, m_keep_order, m_check_data_file } template inline void sample_list ::write_header(std::string& sstr, size_t num_files) const { - // The first line indicate if the list is exclusive or inclusive - // The next line contains the number of samples (included and excluded), - // as well as the number of files, which are the same in this caes - // The next line contains the root data file directory - - sstr += (m_header.is_exclusive()? sample_exclusion_list + "\n" : sample_inclusion_list + "\n"); - size_t total, included, excluded; - get_num_samples(total, included, excluded); - /// TODO: clarify the comment below - /// Include the number of invalid samples, which for an inclusive index list is always 0 - sstr += std::to_string(included) + ' ' + std::to_string(excluded) + ' ' + std::to_string(num_files) + '\n'; + // The first line indicate if the list is single-sample-per-file type, + // multi-sample-exclusive or multi-sample-inclusive. + // The second line contains the number of samples (included and excluded + // when applicable), as well as the number of files. + // The third line contains the root data file directory. + // The fourth line contains the path to the label file when applicable + + if (m_header.is_multi_sample()) { + sstr += (m_header.is_exclusive()? 
multi_sample_exclusion + "\n" : multi_sample_inclusion + "\n"); + + size_t total, included, excluded; + get_num_samples(total, included, excluded); + + sstr += std::to_string(included) + ' ' + std::to_string(excluded) + ' ' + std::to_string(num_files) + '\n'; + } else { + sstr += single_sample + "\n"; + sstr += std::to_string(num_files) + '\n'; + } sstr += m_header.get_file_dir() + '\n'; + sstr += m_header.get_label_filename() + '\n'; } template @@ -547,8 +645,21 @@ ::to_string(std::string& sstr) const { sstr.clear(); - // reserve the string to hold the entire sample lit - size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1 + total_len + 1000; + static const size_t max_type_len + = std::max(std::max(multi_sample_exclusion.size(), + multi_sample_inclusion.size()), + single_sample.size()); + + static const size_t max_num_len + = std::to_string(std::numeric_limits::max()).size(); + + // reserve the string to hold the entire sample list + size_t estimated_len = max_type_len + + max_num_len + 2 + + m_header.get_file_dir().size() + + m_header.get_label_filename().size() + + 4 // sizeof('\n') * 4 + + total_len + 1000; sstr.reserve(estimated_len); // write the list header @@ -614,11 +725,17 @@ ::get_samples_filename(sample_file_id_t id) const { } template -inline const std::string& sample_list +inline const std::string& sample_list ::get_samples_dirname() const { return m_header.get_file_dir(); } +template +inline const std::string& sample_list +::get_label_filename() const { + return m_header.get_label_filename(); +} + template inline void sample_list ::set_samples_filename(sample_file_id_t id, const std::string& filename) { @@ -637,7 +754,7 @@ ::assign_samples_name() { } } else if constexpr (std::is_same::value) { for (auto& s: m_sample_list) { - s.second = s.first; + s.second = get_samples_filename(s.first); } } else { LBANN_ERROR(std::string{} + " :: base class does not implement this method" @@ -674,7 +791,7 @@ ::assign_samples_name() { template<> inline void sample_list ::assign_samples_name() { for (auto& s: m_sample_list) { - s.second = s.first; + s.second = get_samples_filename(s.first); } } @@ -739,9 +856,92 @@ ::all_gather_packed_lists(lbann_comm& comm) { } } + if (m_keep_order) { + reorder(); + } + assign_samples_name(); return; } +template +inline void sample_list +::reorder() { + if (m_stride > 1ul) { // undo interleaving + const size_t sz = m_sample_list.size(); + const size_t s = sz/m_stride; + const size_t s_more = (sz + m_stride - 1ul)/m_stride; + const size_t n_more = sz - s * m_stride; + + samples_t tmp_sample_list; + tmp_sample_list.reserve(s_more * m_stride); + + for (size_t i = 0ul; i < s_more; ++i) { + for (size_t j = i, k = 0ul; j < sz; ++k) { + tmp_sample_list.push_back(m_sample_list[j]); + //if (tmp_sample_list.size() == sz) break; + j += ((k < n_more)? 
s_more : s); + } + } + tmp_sample_list.resize(sz); + std::swap(m_sample_list, tmp_sample_list); + m_stride = 1ul; + } +} + +template +inline void sample_list +::build_sample_map_from_name_to_index() { + m_map_name_to_idx.clear(); + for (size_t i = 0ul; i < m_sample_list.size(); ++i) { + m_map_name_to_idx.insert(std::make_pair(m_sample_list[i].second, i)); + } +} + +template +inline void sample_list +::clear_sample_map_from_name_to_index() { + m_map_name_to_idx.clear(); + m_map_name_to_idx.rehash(0); + sample_map_t tmp; + tmp.rehash(0); + tmp.swap(m_map_name_to_idx); +} + +template +inline typename sample_list::sample_idx_t sample_list +::get_sample_index(const sample_name_t& sn) { + typename sample_map_t::const_iterator it = m_map_name_to_idx.find(sn); + if (it == m_map_name_to_idx.cend()) { + return size(); + //LBANN_ERROR(" :: cannot find the sample name ", lbann::to_string(sn)); + } + return it->second; +} + +template +inline void sample_list +::keep_sample_order(bool keep) { + m_keep_order = keep; +} + +template +inline void sample_list +::set_sample_list_name(const std::string& n) { + m_header.set_sample_list_name(n); +} + +template +inline void sample_list +::set_data_file_check() { + m_check_data_file = true; +} + +template +inline void sample_list +::unset_data_file_check() { + m_check_data_file = false; +} + } // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_open_files.hpp b/include/lbann/data_readers/sample_list_open_files.hpp index 57bfb89980e..4eac42c5c47 100644 --- a/include/lbann/data_readers/sample_list_open_files.hpp +++ b/include/lbann/data_readers/sample_list_open_files.hpp @@ -16,13 +16,13 @@ class sample_list_open_files : public sample_list { using sample_file_id_t = std::size_t; /** To describe a sample as a pair of the file to which it belongs and its name Each file may contain multiple samples. 
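 As an illustration, a multi-sample list using the four-line header
 written by write_header() might begin as follows (the directory,
 label file, and counts are hypothetical):

   MULTI-SAMPLE_INCLUSION
   6 0 2
   /p/data/my_dataset
   labels.txt

 followed by one line per data file enumerating the samples to include
 from that file.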
*/ - using sample_t = std::pair; + using sample_t = typename sample_list::sample_t; /// Information for each file used by the sample list: includes the file name, file descriptor, and /// and a queue of each step and substep when data will be loaded from the file using file_id_stats_t = std::tuple>>; /// Type for the list of samples - using samples_t = std::template vector< sample_t >; + using samples_t = typename sample_list::samples_t; /// Mapping of the file index to the statistics for each file using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something /// Type for the map of file descriptors to usage step and substep @@ -55,12 +55,6 @@ class sample_list_open_files : public sample_list { /// Serialize this sample list into an std::string object bool to_string(std::string& sstr) const override; - /// Allow read-only access to the internal list data - const samples_t& get_list() const; - - /// Allow read-only access to the metadata of the idx-th sample in the list - const sample_t& operator[](size_t idx) const; - const std::string& get_samples_filename(sample_file_id_t id) const override; file_handle_t get_samples_file_handle(sample_file_id_t id) const; @@ -130,9 +124,6 @@ class sample_list_open_files : public sample_list { file_id_stats_v_t m_file_id_stats_map; private: - /// List of all samples with a file identifier and sample name for each sample - samples_t m_sample_list; - /// Track the number of samples per file std::unordered_map m_file_map; diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp index 565b016bd22..ceec5493ab6 100644 --- a/include/lbann/data_readers/sample_list_open_files_impl.hpp +++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp @@ -48,7 +48,6 @@ template inline void sample_list_open_files ::copy_members(const sample_list_open_files& rhs) { sample_list::copy_members(rhs); - m_sample_list = rhs.m_sample_list; m_file_map = rhs.m_file_map; m_max_open_files = rhs.m_max_open_files; @@ -71,7 +70,7 @@ ::copy_members(const sample_list_open_files& rhs) { template inline size_t sample_list_open_files ::size() const { - return m_sample_list.size(); + return this->m_sample_list.size(); } template @@ -111,7 +110,7 @@ ::read_exclusive_list(std::istream& istrm, const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; - if (filename.empty() || !check_if_file_exists(file_path)) { + if (filename.empty() || (this->m_check_data_file && !check_if_file_exists(file_path))) { LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); } @@ -159,7 +158,7 @@ ::read_exclusive_list(std::istream& istrm, if (found != excluded_sample_indices.cend()) { continue; } - m_sample_list.emplace_back(index, to_sample_name_t(s)); + this->m_sample_list.emplace_back(index, to_sample_name_t(s)); valid_sample_count++; } @@ -173,7 +172,7 @@ ::read_exclusive_list(std::istream& istrm, if (m_header.get_num_files() != cnt_files) { LBANN_ERROR(std::string("Sample list ") - + m_header.get_sample_list_filename() + + m_header.get_sample_list_name() + std::string(": number of files requested ") + std::to_string(m_header.get_num_files()) + std::string(" does not equal number of files loaded ") @@ -214,7 +213,7 @@ ::read_inclusive_list(std::istream& istrm, const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; - if (filename.empty() || !check_if_file_exists(file_path)) { + if (filename.empty() || 
(this->m_check_data_file && !check_if_file_exists(file_path))) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: data file '" + filename + "' does not exist."); } @@ -236,7 +235,7 @@ ::read_inclusive_list(std::istream& istrm, while(!sstr.eof()) { std::string sample_name_str; sstr >> sample_name_str; - m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); + this->m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); #ifdef VALIDATE_SAMPLE_LIST sample_names.emplace_back(sample_name_str); #endif @@ -296,7 +295,7 @@ ::save( Archive & ar ) const { for(auto&& e : m_file_id_stats_map) { file_stats.emplace_back(std::make_tuple(std::get<0>(e), std::get<2>(e))); } - ar(m_header, m_sample_list, file_stats); + ar(m_header, this->m_sample_list, file_stats); } template @@ -305,7 +304,7 @@ void sample_list_open_files ::load( Archive & ar ) { using ar_file_stats_t = std::tuple>>; std::vector file_stats; - ar(m_header, m_sample_list, file_stats); + ar(m_header, this->m_sample_list, file_stats); m_file_id_stats_map.reserve(file_stats.size()); for(auto&& e : file_stats) { //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle(), std::deque>{})); @@ -318,15 +317,28 @@ template inline bool sample_list_open_files ::to_string(std::string& sstr) const { std::map> tmp_file_map; - for (const auto& s : m_sample_list) { + for (const auto& s : this->m_sample_list) { const std::string& filename = get_samples_filename(s.first); tmp_file_map[filename].emplace_back(s.second); } sstr.clear(); - // reserve the string to hold the entire sample lit - size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1; + static const size_t max_type_len + = std::max(std::max(multi_sample_exclusion.size(), + multi_sample_inclusion.size()), + single_sample.size()); + + static const size_t max_num_len + = std::to_string(std::numeric_limits::max()).size(); + + // reserve the string to hold the entire sample list + size_t estimated_len = max_type_len + + max_num_len * 3 + 2 + + m_header.get_file_dir().size() + + m_header.get_label_filename().size() + + 4; + for (const auto& f : tmp_file_map) { estimated_len += f.first.size() + std::to_string(f.second.size()).size() @@ -370,18 +382,6 @@ ::get_num_samples(size_t& total, size_t& included, size_t& excluded) const { excluded = total - included; } -template -inline const typename sample_list_open_files::samples_t& -sample_list_open_files::get_list() const { - return m_sample_list; -} - -template -inline const typename sample_list_open_files::sample_t& -sample_list_open_files::operator[](size_t idx) const { - return m_sample_list[idx]; -} - template inline const std::string& sample_list_open_files ::get_samples_filename(sample_file_id_t id) const { @@ -522,14 +522,14 @@ ::all_gather_packed_lists(lbann_comm& comm) { } m_open_fd_pq.clear(); - size_t num_samples = this->all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_samples = this->all_gather_field(this->m_sample_list, per_rank_samples, comm); size_t num_ids = this->all_gather_field(my_files, per_rank_files, comm); size_t num_files = this->all_gather_field(m_file_map, per_rank_file_map, comm); - m_sample_list.clear(); + this->m_sample_list.clear(); m_file_id_stats_map.clear(); - m_sample_list.reserve(num_samples); + this->m_sample_list.reserve(num_samples); m_file_id_stats_map.reserve(num_ids); m_file_map.reserve(num_files); @@ -557,10 +557,16 @@ ::all_gather_packed_lists(lbann_comm& comm) { } index = 
search_result->second; } - m_sample_list.emplace_back(std::make_pair(index, s.second)); + this->m_sample_list.emplace_back(std::make_pair(index, s.second)); } } + if (this->m_keep_order) { + this->reorder(); + } + + // For multi-sample per file case, sample names are read from the sample list + // file. return; } @@ -579,7 +585,7 @@ ::compute_epochs_file_usage(const std::vector& shuffled_indices, m_open_fd_pq.clear(); for (size_t i = 0; i < shuffled_indices.size(); i++) { int idx = shuffled_indices[i]; - const auto& s = m_sample_list[idx]; + const auto& s = this->m_sample_list[idx]; sample_file_id_t index = s.first; if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast(comm.get_rank_in_trainer())) { @@ -648,7 +654,7 @@ ::manage_open_file_handles(sample_file_id_t id, bool pre_open_fd) { template inline file_handle_t sample_list_open_files ::open_samples_file_handle(const size_t i, bool pre_open_fd) { - const sample_t& s = m_sample_list[i]; + const sample_t& s = this->m_sample_list[i]; sample_file_id_t id = s.first; file_handle_t h = get_samples_file_handle(id); if (!is_file_handle_valid(h)) { @@ -675,7 +681,7 @@ ::open_samples_file_handle(const size_t i, bool pre_open_fd) { template inline void sample_list_open_files ::close_if_done_samples_file_handle(const size_t i) { - const sample_t& s = m_sample_list[i]; + const sample_t& s = this->m_sample_list[i]; sample_file_id_t id = s.first; auto h = get_samples_file_handle(id); if (!is_file_handle_valid(h)) { diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index df0f1ced1d4..65716e06965 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -48,6 +48,16 @@ namespace lbann { class generic_data_reader; +/** Create a hash function for hashing a std::pair type */ +struct size_t_pair_hash +{ + template + std::size_t operator() (const std::pair &pair) const + { + return std::hash()(pair.first) ^ std::hash()(pair.second); + } +}; + class data_store_conduit { public: @@ -56,6 +66,9 @@ class data_store_conduit { using map_ii_t = std::unordered_map; using map_is_t = std::unordered_map; + // Hash map for tracking the node and hyperslab partition ID + using map_pssi_t = std::unordered_map,int,size_t_pair_hash>; + // not currently used; will be in the future using map_ss_t = std::unordered_map; @@ -113,13 +126,13 @@ class data_store_conduit { //================================================================= // methods for setting and querying the data store's mode //================================================================= - /** @brief Returns true if preloading is turned on + /** @brief Returns true if preloading is turned on * * See notes in: is_explicitly_loading() */ bool is_preloading() const { return m_preloading; } - /** @brief Returns true if explicitly loading is turned on + /** @brief Returns true if explicitly loading is turned on * * 'explicitly loading' means that the data that will be owned * by each rank is passed into the data store during the first epoch. @@ -130,7 +143,7 @@ class data_store_conduit { */ bool is_explicitly_loading() const { return m_explicitly_loading; } - /** @brief Returns true if all loading has been completed + /** @brief Returns true if all loading has been completed * * See notes in: set_loading_is_complete() */ @@ -143,35 +156,34 @@ class data_store_conduit { * but part of the set may be spilled to disk if memory is * insufficient. 
Local cache mode is activated via the cmd line * flag: --data_store_cache - */ + */ bool is_local_cache() const { return m_is_local_cache; } - /** @brief Turn preloading on or off */ + /** @brief Turn preloading on or off */ void set_is_preloading(bool flag); - /** @brief Turn on explicit loading */ + /** @brief Turn on explicit loading */ void set_is_explicitly_loading(bool flag); /** @brief Marks the data_store as fully loaded * * Fully loaded means that each rank has all the data that it * is intended to own. When not running in local cache mode, this - * occurs (1) at the conclusion of preloading, prior to the beginning of - * the first epoch, or (2) at the conclusion of the first epoch, if - * explicitly loading. When running in local cache mode, this occurs - * (1) at the conclusion of preload_local_cache(), which is called prior + * occurs (1) at the conclusion of preloading, prior to the beginning of + * the first epoch, or (2) at the conclusion of the first epoch, if + * explicitly loading. When running in local cache mode, this occurs + * (1) at the conclusion of preload_local_cache(), which is called prior * to the first epoch, or (2) at the conclusion of exchange_local_caches(), * at th conclusion of the first epoch, if explicitly loading. */ - void set_loading_is_complete(); - + void set_loading_is_complete(); /** @brief turns local cache mode on of off */ void set_is_local_cache(bool flag = true) { m_is_local_cache = flag; } /** @brief Check that explicit loading, preloading, and fully loaded flags are consistent */ void check_query_flags() const; - + //================================================================= // END methods for setting and querying the data store's mode //================================================================= @@ -184,15 +196,23 @@ class data_store_conduit { void build_preloaded_owner_map(const std::vector& per_rank_list_sizes); /// fills in m_owner, which maps index -> owning processor - void set_preloaded_owner_map(const std::unordered_map &owner) { m_owner = owner; } + void set_preloaded_owner_map(const std::unordered_map &owner) { + for(auto&& i : owner) { + m_owner[std::make_pair(i.first, m_offset_in_partition)] = i.second; + } + } /** @brief Special hanling for ras_lipid_conduit_data_reader; may go away in the future */ void clear_owner_map(); - void set_owner_map(const std::unordered_map &m) { m_owner = m; } + void set_owner_map(const std::unordered_map &m) { + for(auto&& i : m) { + m_owner[std::make_pair(i.first, m_offset_in_partition)] = i.second; + } + } /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */ - void add_owner(int data_id, int owner) { m_owner[data_id] = owner; } + void add_owner(int data_id, int owner) { m_owner[std::make_pair(data_id, m_offset_in_partition)] = owner; } /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */ void set_finished_building_map() { m_owner_maps_were_exchanged = true; } @@ -213,7 +233,7 @@ class data_store_conduit { */ void preload_local_cache(); - void exchange_mini_batch_data(size_t current_pos, size_t mb_size); + void exchange_mini_batch_data(size_t current_pos, size_t mb_size); void set_node_sizes_vary() { m_node_sizes_vary = true; } @@ -235,17 +255,17 @@ class data_store_conduit { * * Debug logging is enabled on all ranks via the cmd line flag: --data_store_debug */ - void flush_debug_file(); + void flush_debug_file(); /** @brief Closes then reopens the profile logging file * * Profile logging is enabled on P_0 
via the cmd line flag: --data_store_profile */ - void flush_profile_file() const; + void flush_profile_file() const; /** @brief Writes object's state to file */ void write_checkpoint(std::string dir_name); - + /** @brief Loads object's state from file */ void load_checkpoint(std::string dir_name, generic_data_reader *reader = nullptr); @@ -260,7 +280,7 @@ * @param n is the maximum number of samples to test; set to -1 to test all * @return true, if all samples read from file match those constructed from * the local shared memory segment (aka, cache) - */ + */ bool test_local_cache_imagenet(int n); void test_imagenet_node(int sample_id, bool dereference = true); @@ -298,7 +318,7 @@ private : /** @brief Used to form the directory path for spilling conduit nodes */ int m_cur_spill_dir_integer = -1; - /** @brief @brief Current directory for spilling (writing to file) conduit nodes + /** @brief Current directory for spilling (writing to file) conduit nodes * * m_cur_spill_dir = m_spill_dir_base/ */ @@ -360,14 +380,14 @@ double m_rebuild_time = 0; // total time for exchange_mini_batch_data - double m_exchange_time = 0; + double m_exchange_time = 0; - // sanity check: + // sanity check: // m_start_snd_rcv_time + m_wait_all_time + m_rebuild_time // should be only slightly less than m_exchange_time; // Note that, for imagenet, the first call to exchange_data_by_sample // involves additional communication for exchanging sample sizes - + //=========================================================== // END: timers for profiling exchange_data //=========================================================== @@ -380,7 +400,7 @@ private : /** @brief True, if we are in preload mode */ bool m_preloading = false; - /** @brief True, if we are in explicit loading mode + /** @brief True, if we are in explicit loading mode * * There is some redundancy here: m_preloading and m_explicitly_loading * can not both be true, but both may be false. When m_loading_is_complete @@ -412,22 +432,35 @@ private : bool m_world_master; bool m_trainer_master; int m_rank_in_trainer; - int m_rank_in_world = -1; // -1 for debugging + int m_rank_in_world = -1; // -1 for debugging + int m_partition_in_trainer; + int m_offset_in_partition; + + /// number of procs in the trainer; convenience handle int m_np_in_trainer; + int m_num_partitions_in_trainer; - /** @brief Maps an index to the processor that owns the associated data */ - map_ii_t m_owner; + /** @brief Maps an index to the processor that owns the associated data + * The first value of the index is the sample ID and the second value is the partition ID + * + * Must be mutable since rhs.m_owner may be modified in copy_members, + * in which rhs is const. + */ + mutable map_pssi_t m_owner; /// convenience handle const std::vector *m_shuffled_indices; /** @brief Contains the conduit nodes that are "owned" by this rank * - * Maps data_id -> conduit::Node. - */ - std::unordered_map m_data; + * Maps data_id -> conduit::Node. + * Must be mutable since rhs.m_data may be modified in copy_members, + * in which rhs is const.
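+ * (Sketch of the pair-keyed ownership lookup used by these maps,
+ * assuming m_offset_in_partition identifies this rank's hyperslab
+ * partition, as add_owner() above does:
+ *   m_owner[std::make_pair(data_id, m_offset_in_partition)] = owner_rank;
+ * whereas the previous map was keyed by the sample index alone.)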
+ */ + mutable std::unordered_map m_data; - /** @brief Contains the conduit nodes that are "owned" by this rank + /** @brief Contains a cache of the conduit nodes that are + * "owned" by this rank * * This differs from m_data in that this holds temporarily, * during the first epoch, if we're running in local cache mode @@ -452,11 +485,11 @@ private : std::vector m_outgoing_msg_sizes; std::vector m_incoming_msg_sizes; - /** @brief Maps a data_id to its image size + /** @brief Maps a data_id to its image size * * Used when conduit Nodes have non-uniform size, e.g, imagenet; * see: set_node_sizes_vary() - */ + */ map_is_t m_sample_sizes; /** @brief Maps a data_id to the image location in a shared memory segment */ @@ -472,7 +505,7 @@ private : std::vector> m_indices_to_recv; //========================================================================= - // methods follow + // methods follow //========================================================================= void exchange_data_by_sample(size_t current_pos, size_t mb_size); @@ -512,29 +545,27 @@ private : void compute_image_offsets(map_is_t &image_sizes, std::vector> &indices); /// for use in local cache mode - void exchange_images(std::vector &work, map_is_t &image_sizes, std::vector> &indices); + void exchange_images(std::vector &work, map_is_t &image_sizes, std::vector> &indices); - /// for use in local cache mode void build_conduit_nodes(map_is_t &sizes); - /// for use in local cache mode void fillin_shared_images(char* images, size_t size, size_t offset); /** @brief For testing during development * - * At the beginning of the 2nd epoch, calls write_checkpoint(), - * clears some variables, calls load_checkpoint then continues. + * At the beginning of the 2nd epoch, calls write_checkpoint(), + * clears some variables, calls load_checkpoint then continues. * To activate this test use cmd flag: --data_store_test_checkpoint= - */ + */ void test_checkpoint(const std::string&); /** @brief Called by test_checkpoint */ void print_variables(); - /** @brief Called by test_checkpoint + /** @brief Called by test_checkpoint * - * For testing and development. Prints the first 'n' entries from + * For testing and development. Prints the first 'n' entries from * the owner map * (which maps sample_id -> owning rank) to std::cout */ void print_partial_owner_map(int n); @@ -544,7 +575,7 @@ private : std::string get_metadata_fn() const; /** @brief Creates the directory if it does not already exist */ - void make_dir_if_it_doesnt_exist(const std::string &dir); + void make_dir_if_it_doesnt_exist(const std::string &dir); /** @brief Writes conduit node to file */ void spill_conduit_node(const conduit::Node &node, int data_id); @@ -554,8 +585,8 @@ private : /** @brief Creates directory structure, opens metadata file for output, etc * - * This method is called for both --data_store_spill and - * --data_store_test_checkpoint + * This method is called for both --data_store_spill and + * --data_store_test_checkpoint */ void setup_spill(std::string dir); @@ -572,7 +603,7 @@ private : * files are opened if the cmd flag --data_store_debug is passed. * A profiling file is opened only be * pairs; files are opened if the cmd flag --data_store_profile is passed. 
- */ + */ void open_informational_files(); /** @brief Creates a directory for spilling conduit nodes */ @@ -591,11 +622,11 @@ private : // functions and templates for optional profiling and debug files follow //========================================================================= - void PROFILE() const { + void PROFILE() const { if (!m_profile) { return; } - (*m_profile) << std::endl; + (*m_profile) << std::endl; flush_profile_file(); } @@ -612,11 +643,11 @@ private : flush_profile_file(); } - void DEBUG_DS() { + void DEBUG_DS() { if (!m_debug) { return; } - (*m_debug) << std::endl; + (*m_debug) << std::endl; flush_debug_file(); } diff --git a/include/lbann/io/data_buffers/generic_io_buffer.hpp b/include/lbann/io/data_buffers/generic_io_buffer.hpp index a8d4f7ecec0..440742d698f 100644 --- a/include/lbann/io/data_buffers/generic_io_buffer.hpp +++ b/include/lbann/io/data_buffers/generic_io_buffer.hpp @@ -55,13 +55,14 @@ class fetch_data_functor { case data_reader_target_mode::NA: throw lbann_exception("Invalid data reader target mode"); case data_reader_target_mode::CLASSIFICATION: + case data_reader_target_mode::LABEL_RECONSTRUCTION: default: num_responses_fetched = data_reader->fetch_labels(responses); } if(num_samples_fetched != num_responses_fetched) { - std::string err = std::string("Number of samples: ") + std::to_string(num_samples_fetched) - + std::string(" does not match the number of responses: ") + std::to_string(num_responses_fetched); - throw lbann_exception(err); + LBANN_ERROR("Number of samples (",num_samples_fetched,") ", + "does not match the ", + "number of responses (",num_responses_fetched,")"); } return num_samples_fetched; } @@ -73,6 +74,7 @@ class fetch_data_functor { case data_reader_target_mode::REGRESSION: case data_reader_target_mode::RECONSTRUCTION: case data_reader_target_mode::CLASSIFICATION: + case data_reader_target_mode::LABEL_RECONSTRUCTION: default: throw lbann_exception("Invalid data reader target mode"); } diff --git a/include/lbann/layers/activations/identity.hpp b/include/lbann/layers/activations/identity.hpp index ff59d2138dd..ec73dec5e08 100644 --- a/include/lbann/layers/activations/identity.hpp +++ b/include/lbann/layers/activations/identity.hpp @@ -78,7 +78,7 @@ class identity_layer : public data_type_layer { bool is_distconv_supported() const override { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/activations/leaky_relu.hpp b/include/lbann/layers/activations/leaky_relu.hpp index b936a5ac1b9..20e871fd030 100644 --- a/include/lbann/layers/activations/leaky_relu.hpp +++ b/include/lbann/layers/activations/leaky_relu.hpp @@ -96,7 +96,7 @@ class leaky_relu_layer : public data_type_layer { bool is_distconv_supported() const override { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp index f95c663ac86..f4d0996bc36 100644 --- a/include/lbann/layers/activations/relu.hpp +++ b/include/lbann/layers/activations/relu.hpp @@ -65,7 +65,7 @@ class relu_layer : public data_type_layer { bool is_distconv_supported() const 
override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp index 0a3a4c9917a..0bf855d914d 100644 --- a/include/lbann/layers/activations/softmax.hpp +++ b/include/lbann/layers/activations/softmax.hpp @@ -189,7 +189,7 @@ class softmax_layer : public data_type_layer { bool is_distconv_supported() const override { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/data_type_distconv_adapter.hpp b/include/lbann/layers/data_type_distconv_adapter.hpp index a120965ad67..f483bf9f0c6 100644 --- a/include/lbann/layers/data_type_distconv_adapter.hpp +++ b/include/lbann/layers/data_type_distconv_adapter.hpp @@ -156,6 +156,8 @@ class data_type_distconv_adapter: public distconv_adapter { void set_activations_outermost_dimension(size_t dim); void set_error_signals_outermost_dimension(size_t dim); + + size_t get_max_mini_batch_size() const; }; } // namespace lbann diff --git a/include/lbann/layers/data_type_layer.hpp b/include/lbann/layers/data_type_layer.hpp index 2c363ccef21..c79d0e63b32 100644 --- a/include/lbann/layers/data_type_layer.hpp +++ b/include/lbann/layers/data_type_layer.hpp @@ -365,7 +365,7 @@ class data_type_layer : public Layer { const data_type_distconv_adapter& get_distconv_adapter() const override; protected: - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; #endif // LBANN_HAS_DISTCONV #ifdef LBANN_HAS_CUDA diff --git a/include/lbann/layers/io/input/input_layer.hpp b/include/lbann/layers/io/input/input_layer.hpp index 35bdee54a92..87b9fc8f88f 100644 --- a/include/lbann/layers/io/input/input_layer.hpp +++ b/include/lbann/layers/io/input/input_layer.hpp @@ -47,7 +47,7 @@ class input_distconv_adapter: public data_type_distconv_adapter using TensorHost = dc::TensorHost; using TensorHostShuffler = dc::TensorHostShuffler; - input_distconv_adapter(Layer& layer); + input_distconv_adapter(Layer& layer, const bool shuffle_required); virtual ~input_distconv_adapter() = default; TensorHostShuffler &get_shuffler(const TensorHost &src, const TensorHost &dst, @@ -78,7 +78,7 @@ class input_distconv_adapter: public data_type_distconv_adapter std::vector> m_original_host_tensors; std::vector> m_host_tensors; - bool m_shuffle_required; + const bool m_shuffle_required; std::vector, 4>> m_shufflers; std::unique_ptr m_shuffler_src_buf; size_t m_shuffler_src_buf_size = 0; @@ -142,8 +142,9 @@ class input_layer : public generic_input_layer { bool is_distconv_supported() const override { return Dev == El::Device::CPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { - this->get_distconv_adapter_ptr() = make_unique(*this); + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { + this->get_distconv_adapter_ptr() = make_unique( + *this, dr_metadata.shuffle_required); } distconv_adapter_type& get_distconv_adapter() override; const distconv_adapter_type& get_distconv_adapter() const override; diff --git a/include/lbann/layers/layer.hpp 
b/include/lbann/layers/layer.hpp index 9778e2f433f..f43ee4c0f76 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -764,8 +764,8 @@ class Layer { /** Indicate whether distconv is supported. */ virtual bool is_distconv_supported() const { return false; } /** Pre-initialize distconv attributes needed for setup_data(). */ - void prepare_distconv(); - virtual void setup_distconv_adapter() = 0; + void prepare_distconv(const DataReaderMetaData& dr_metadata); + virtual void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) = 0; std::unique_ptr& get_distconv_adapter_ptr() { return m_dc; }; const std::unique_ptr& get_distconv_adapter_ptr() const { diff --git a/include/lbann/layers/learning/CMakeLists.txt b/include/lbann/layers/learning/CMakeLists.txt index 71111d57435..a5f96c38e62 100644 --- a/include/lbann/layers/learning/CMakeLists.txt +++ b/include/lbann/layers/learning/CMakeLists.txt @@ -9,6 +9,7 @@ set_full_path(THIS_DIR_HEADERS entrywise_scale_bias.hpp fully_connected.hpp fully_connected_cuda.hpp + gru.hpp learning.hpp ) diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index 5f15e935ee0..252314ec3b1 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -256,7 +256,7 @@ class base_convolution_layer : public data_type_layer { friend class base_convolution_adapter; protected: using BaseConvAdapterType = base_convolution_adapter; - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; BaseConvAdapterType& get_distconv_adapter() override; const BaseConvAdapterType& get_distconv_adapter() const override; #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index 19fb2daf248..8980ee32c8e 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -115,7 +115,7 @@ class convolution_layer #ifdef LBANN_HAS_DISTCONV friend class convolution_distconv_adapter; protected: - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; bool is_distconv_supported() const override; #endif // LBANN_HAS_DISTCONV }; diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index 6ebce704f9b..7977065655a 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -104,7 +104,7 @@ class deconvolution_layer : public base_convolution_layer; protected: - void setup_distconv_adapter() override; + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override; bool is_distconv_supported() const override; #endif // LBANN_HAS_DISTCONV }; diff --git a/include/lbann/layers/learning/gru.hpp b/include/lbann/layers/learning/gru.hpp new file mode 100644 index 00000000000..562a782f626 --- /dev/null +++ b/include/lbann/layers/learning/gru.hpp @@ -0,0 +1,130 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED +#define LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#ifdef LBANN_HAS_CUDNN +#include "lbann/utils/cudnn.hpp" +#endif // LBANN_HAS_CUDNN + +namespace lbann { + +/** @brief Gated recurrent unit + * + * Expects two inputs: a 2D input sequence ( + * @f$ \text{sequence\_length}\times\text{input\_size} @f$ ) + * and a 1D initial hidden state ( @f$ \text{hidden\_size} @f$ ). + * + * Uses four weights: "ih\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{input\_size} @f$ ), + * "hh\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{hidden\_size} @f$ ), + * "ih\_bias" ( @f$ 3 \text{hidden\_size} @f$ ), + * "hh\_bias" ( @f$ 3 \text{hidden\_size} @f$ ). + * + * @todo Support CPU + * @todo Support bidirectional RNNs + * @todo Support stacked RNNs + * + * @warning cuDNN 8 exposes a new RNN API and deprecates the old one. + * Consider reimplementing this layer once cuDNN 8 is the minimum + * version.
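+ * + * A minimal Python front-end sketch (hypothetical usage: it assumes the generated @c lbann.GRU layer class and its @c hidden_size argument, which this patch does not show): + * @code{.py} + * import lbann + * seq = lbann.Identity(lbann.Input())             # sequence_length x input_size + * h0 = lbann.Constant(value=0, num_neurons='16')  # hidden_size + * y = lbann.GRU(seq, h0, hidden_size=16) + * @endcode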
+ */ +template +class gru_layer + : public data_type_layer { + + static_assert(Layout == data_layout::DATA_PARALLEL, + "GRU layer only supports data parallel layout"); + +public: + + gru_layer( + lbann_comm* comm, + size_t hidden_size); + + gru_layer(const gru_layer& other); + gru_layer& operator=(const gru_layer& other); + ~gru_layer() = default; + + gru_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; +#ifdef LBANN_HAS_CUDNN + void setup_gpu() override; +#endif // LBANN_HAS_CUDNN + + void fp_compute() override; + void bp_compute() override; + +private: + + size_t m_hidden_size; + +#ifdef LBANN_HAS_CUDNN + using ByteBuffer = hydrogen::simple_buffer; + cudnn::RNNDescriptor m_rnn_cudnn_desc; + cudnn::TensorDescriptor m_input_cudnn_desc; + cudnn::TensorDescriptor m_output_cudnn_desc; + cudnn::TensorDescriptor m_hidden_cudnn_desc; + cudnn::FilterDescriptor m_packed_weights_cudnn_desc; + ByteBuffer m_cudnn_reserve_space; +#endif // LBANN_HAS_CUDNN + + template + friend void fp_compute_impl(gru_layer&); + template + friend void bp_compute_impl(gru_layer&); + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(gru); + +// Explicit template instantiation +#ifdef LBANN_HAS_CUDNN +#ifndef LBANN_GRU_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class gru_layer< \ + T, data_layout::DATA_PARALLEL, El::Device::GPU>; +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#endif // LBANN_GRU_LAYER_INSTANTIATE +#endif // LBANN_HAS_CUDNN + +} // namespace lbann + +#endif // LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED diff --git a/include/lbann/layers/loss/cross_entropy.hpp b/include/lbann/layers/loss/cross_entropy.hpp index 238bda958a4..0f44d88d04e 100644 --- a/include/lbann/layers/loss/cross_entropy.hpp +++ b/include/lbann/layers/loss/cross_entropy.hpp @@ -37,7 +37,9 @@ template class cross_entropy_distconv_adapter: public data_type_distconv_adapter { public: using TensorDevType = typename data_type_distconv_adapter::TensorDevType; - cross_entropy_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + cross_entropy_distconv_adapter(Layer& layer, bool use_labels) + : data_type_distconv_adapter(layer), + m_use_labels(use_labels){} virtual ~cross_entropy_distconv_adapter() = default; void setup_distributions(tensor_overlap_constraints &constraints) override; dc::Shape get_prev_activations_shape(int index) const override; @@ -45,6 +47,7 @@ class cross_entropy_distconv_adapter: public data_type_distconv_adapter m_cross_entropy; + bool m_use_labels; }; #endif // LBANN_HAS_DISTCONV @@ -67,12 +70,14 @@ class cross_entropy_layer : public data_type_layer { public: - cross_entropy_layer(lbann_comm *comm) : data_type_layer(comm) { + cross_entropy_layer(lbann_comm *comm, bool use_labels) + : data_type_layer(comm), + m_use_labels(use_labels) { this->m_expected_num_parent_layers = 2; } cross_entropy_layer(const cross_entropy_layer& other) - : data_type_layer(other) { + : data_type_layer(other), m_use_labels(other.m_use_labels) { m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); @@ -80,6 +85,7 @@ class cross_entropy_layer : public data_type_layer { cross_entropy_layer& operator=(const cross_entropy_layer& other) { data_type_layer::operator=(other); + m_use_labels = other.m_use_labels; m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); @@ -160,8 +166,16 @@ class cross_entropy_layer : public data_type_layer { if (this->distconv_enabled()) { fp_compute_distconv(); return; + } else { + if(m_use_labels) { + LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels."); + } } -#endif +#else // LBANN_HAS_DISTCONV + if(m_use_labels) { + LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels."); + } +#endif // LBANN_HAS_DISTCONV // Initialize workspace const auto& prediction = this->get_prev_activations(0); @@ -182,6 +196,14 @@ class cross_entropy_layer : public data_type_layer { if (this->distconv_enabled()) { bp_compute_distconv(); return; + } else { + if(m_use_labels) { + LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels."); + } + } +#else // LBANN_HAS_DISTCONV + if(m_use_labels) { + LBANN_ERROR("Cross-entropy layers without Distconv don't support use_labels."); } #endif // LBANN_HAS_DISTCONV @@ -201,6 +223,9 @@ class cross_entropy_layer : public data_type_layer { /** Compute local gradients. */ void local_bp_compute(); + /** Use integer label tensors as ground-truth. */ + bool m_use_labels; + /** Workspace matrix. */ std::unique_ptr m_workspace; @@ -211,9 +236,9 @@ class cross_entropy_layer : public data_type_layer { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< - cross_entropy_distconv_adapter>(*this); + cross_entropy_distconv_adapter>(*this, m_use_labels); } cross_entropy_distconv_adapter& get_distconv_adapter() override; @@ -323,7 +348,7 @@ setup_distributions(tensor_overlap_constraints &constraints) { template void cross_entropy_distconv_adapter::setup_layer( size_t workspace_capacity) { - m_cross_entropy = make_unique(dc::get_backend()); + m_cross_entropy = make_unique(dc::get_backend(), m_use_labels); m_cross_entropy->setup(this->get_prev_activations(0), this->get_prev_activations(1), this->get_activations(0)); diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 4c5a3013eed..c389567ca98 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -386,7 +386,7 @@ class batch_normalization_layer : public regularizer_layer { bool is_distconv_supported() const override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< batch_normalization_distconv_adapter>(*this); } diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp index 2b3e5091436..f7306baa9cd 100644 --- a/include/lbann/layers/transform/concatenate.hpp +++ b/include/lbann/layers/transform/concatenate.hpp @@ -114,7 +114,7 @@ class concatenate_layer : public data_type_layer { return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL && m_concat_dim == 0; } - void
setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< concatenate_distconv_adapter>(*this); } @@ -377,7 +377,7 @@ fp_compute() { dc::tensor::Concatenate(this->get_activations(0), this->get_prev_activations(0), this->get_prev_activations(1), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } template @@ -386,7 +386,7 @@ bp_compute() { dc::tensor::Slice(this->get_error_signals(0), this->get_error_signals(1), this->get_prev_error_signals(0), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index 35db88a633c..176fc26abd0 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -542,7 +542,7 @@ class pooling_layer : public transform_layer { friend class pooling_distconv_adapter; protected: bool is_distconv_supported() const override; - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique< pooling_distconv_adapter>(*this); } diff --git a/include/lbann/layers/transform/split.hpp b/include/lbann/layers/transform/split.hpp index 90248ce99a8..5cdeedf681f 100644 --- a/include/lbann/layers/transform/split.hpp +++ b/include/lbann/layers/transform/split.hpp @@ -106,7 +106,7 @@ class split_layer : public transform_layer { bool is_distconv_supported() const override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/include/lbann/layers/transform/sum.hpp b/include/lbann/layers/transform/sum.hpp index 7786f72f634..6d7c11884ff 100644 --- a/include/lbann/layers/transform/sum.hpp +++ b/include/lbann/layers/transform/sum.hpp @@ -129,7 +129,7 @@ class sum_layer : public transform_layer { bool is_distconv_supported() const override { return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; } - void setup_distconv_adapter() override { + void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override { this->get_distconv_adapter_ptr() = make_unique>(*this); } sum_distconv_adapter& get_distconv_adapter() override; diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index a9dbdf2c553..109782a3ab1 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -54,6 +54,7 @@ #include "lbann/layers/learning/embedding.hpp" #include "lbann/layers/learning/channelwise_scale_bias.hpp" #include "lbann/layers/learning/entrywise_scale_bias.hpp" +#include "lbann/layers/learning/gru.hpp" /// Loss layers #include "lbann/layers/loss/categorical_accuracy.hpp" @@ -136,6 +137,9 @@ #include "lbann/data_readers/data_reader_pilot2_molecular.hpp" #include "lbann/data_readers/data_reader_mesh.hpp" #include "lbann/data_readers/data_reader_python.hpp" +#ifdef LBANN_HAS_DISTCONV +#include "lbann/data_readers/data_reader_hdf5.hpp" +#endif // LBANN_HAS_DISTCONV #include "lbann/data_readers/data_reader_smiles.hpp" /// Data stores diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index de30914d0c8..10cfa4896c1 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -415,6 +415,11 @@ class 
model { /** @brief Execute callbacks at the end of weight optimization. */ virtual void do_weight_optimize_end_cbs(weights *w); +#ifdef LBANN_HAS_DISTCONV + /** @brief Return the maximum mini-batch size used by Distconv. */ + size_t get_max_mini_batch_size_distconv() const { return m_max_mini_batch_size_distconv; } +#endif + private: /** Pointer to the execution context object used for training or evaluating this model */ @@ -504,6 +509,12 @@ class model { void setup_distconv(); void setup_distributions(); void print_distributions() const; + + /** @brief The maximum mini-batch size used by Distconv. + * @details This should be set before setup_distconv() is called. + */ + size_t m_max_mini_batch_size_distconv; + #endif // LBANN_HAS_DISTCONV }; diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index 8bb4d50fc25..9af9c0df67f 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -46,19 +46,19 @@ class Trainer; namespace lbann { -/** @brief Customize the name of the index list +/** @brief Customize the name of the sample list * * The following options are available * - trainer ID * - model name * * The format for the naming convention if the provided name is - * \ is: + * \ is: * @verbatim - == . + == . _t_. @endverbatim */ -void customize_data_readers_index_list(const lbann_comm& comm, +void customize_data_readers_sample_list(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief instantiates one or more generic_data_readers and inserts diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp index d124487df3e..de2cfb213fa 100644 --- a/include/lbann/utils/cuda.hpp +++ b/include/lbann/utils/cuda.hpp @@ -167,6 +167,9 @@ template __device__ __forceinline__ T tanh(const T& x); template __device__ __forceinline__ T acosh(const T& x); template __device__ __forceinline__ T asinh(const T& x); template __device__ __forceinline__ T atanh(const T& x); +template __device__ __forceinline__ bool isfinite(const T& x); +template __device__ __forceinline__ bool isinf(const T& x); +template __device__ __forceinline__ bool isnan(const T& x); // Binary math functions template __device__ __forceinline__ T min(const T& x, const T& y); @@ -222,8 +225,9 @@ class event_wrapper { }; // ------------------------------------------------------------- -// Helper functions for entrywise operations +// Helper functions for tensor operations // ------------------------------------------------------------- + #ifdef __CUDACC__ /** Apply an entry-wise unary operator to GPU data. @@ -267,6 +271,16 @@ void apply_entrywise_binary_operator( #endif // __CUDACC__ +/** Copy entries between GPU tensors. */ +template +void copy_tensor( + cudaStream_t stream, + const std::vector& dims, + const TensorDataType* input, + const std::vector& input_strides, + TensorDataType* output, + const std::vector& output_strides); + // ------------------------------------------------------------- // Utilities for Thrust // ------------------------------------------------------------- @@ -303,7 +317,7 @@ class allocator typedef typename parent_class::system_type system_type; /** Default constructor. */ - allocator(cudaStream_t stream = El::GPUManager::Stream()); + allocator(cudaStream_t stream = hydrogen::cuda::GetDefaultStream()); /** Allocate GPU buffer. */ pointer allocate(size_type size); /** Deallocate GPU buffer.
diff --git a/include/lbann/utils/cudnn.hpp b/include/lbann/utils/cudnn.hpp index 355d644805e..2d0b76a657f 100644 --- a/include/lbann/utils/cudnn.hpp +++ b/include/lbann/utils/cudnn.hpp @@ -143,6 +143,229 @@ void copy_tensor_desc(const cudnnTensorDescriptor_t& src, void copy_activation_desc(const cudnnActivationDescriptor_t& src, cudnnActivationDescriptor_t& dst); +//////////////////////////////////////////////////////////// +// Wrapper classes for cuDNN types +//////////////////////////////////////////////////////////// + +/** @brief Wrapper around @c cudnnTensorDescriptor_t */ +class TensorDescriptor { + +public: + + TensorDescriptor(cudnnTensorDescriptor_t desc=nullptr); + template + TensorDescriptor(ArgTs... args) { + set(args...); + } + + ~TensorDescriptor(); + + // Copy-and-swap idiom + TensorDescriptor(const TensorDescriptor&); + TensorDescriptor(TensorDescriptor&&); + TensorDescriptor& operator=(TensorDescriptor); + friend void swap(TensorDescriptor& first, TensorDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnTensorDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnTensorDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnTensorDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnTensorDescriptor_t() const noexcept; + + /** @brief Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** @brief Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + cudnnDataType_t data_type, + const std::vector& dims, + std::vector strides = {}); + /** @brief Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + template + void set( + cudnnDataType_t data_type, + IntTs... dims) { + set(data_type, {static_cast(dims)...}); + } + +private: + + cudnnTensorDescriptor_t desc_{nullptr}; + +}; + +/** Wrapper around @c cudnnFilterDescriptor_t */ +class FilterDescriptor { + +public: + + FilterDescriptor(cudnnFilterDescriptor_t desc=nullptr); + template + FilterDescriptor(ArgTs... args) { + set(args...); + } + + ~FilterDescriptor(); + + // Copy-and-swap idiom + FilterDescriptor(const FilterDescriptor&); + FilterDescriptor(FilterDescriptor&&); + FilterDescriptor& operator=(FilterDescriptor); + friend void swap(FilterDescriptor& first, FilterDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnFilterDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnFilterDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnFilterDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnFilterDescriptor_t() const noexcept; + + /** Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + cudnnDataType_t data_type, + cudnnTensorFormat_t format, + const std::vector& dims); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + template + void set( + cudnnDataType_t data_type, + cudnnTensorFormat_t format, + IntTs... 
dims) { + set(data_type, format, {static_cast(dims)...}); + } + +private: + + cudnnFilterDescriptor_t desc_{nullptr}; + +}; + +/** Wrapper around @c cudnnDropoutDescriptor_t */ +class DropoutDescriptor { + +public: + + DropoutDescriptor(cudnnDropoutDescriptor_t desc=nullptr); + template + DropoutDescriptor(ArgTs... args) { + set(args...); + } + + ~DropoutDescriptor(); + + // Copy-and-swap idiom + DropoutDescriptor(const DropoutDescriptor&); + DropoutDescriptor(DropoutDescriptor&&); + DropoutDescriptor& operator=(DropoutDescriptor); + friend void swap(DropoutDescriptor& first, DropoutDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnDropoutDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnDropoutDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnDropoutDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnDropoutDescriptor_t() const noexcept; + + /** Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + float dropout, + void* states, + size_t states_size, + unsigned long long seed); + +private: + + cudnnDropoutDescriptor_t desc_{nullptr}; + +}; + +/** Wrapper around @c cudnnRNNDescriptor_t */ +class RNNDescriptor { + +public: + + RNNDescriptor(cudnnRNNDescriptor_t desc=nullptr); + template + RNNDescriptor(ArgTs... args) { + set(args...); + } + + ~RNNDescriptor(); + + // Copy-and-swap idiom + RNNDescriptor(const RNNDescriptor&); + RNNDescriptor(RNNDescriptor&&); + RNNDescriptor& operator=(RNNDescriptor); + friend void swap(RNNDescriptor& first, RNNDescriptor& second); + + /** @brief Take ownership of cuDNN object */ + void reset(cudnnRNNDescriptor_t desc=nullptr); + /** @brief Return cuDNN object and release ownership */ + cudnnRNNDescriptor_t release(); + /** @brief Return cuDNN object without releasing ownership */ + cudnnRNNDescriptor_t get() const noexcept; + /** @brief Return cuDNN object without releasing ownership */ + operator cudnnRNNDescriptor_t() const noexcept; + + /** Create cuDNN object + * + * Does nothing if already created. + */ + void create(); + /** Configure cuDNN object + * + * Creates cuDNN object if needed. + */ + void set( + size_t hidden_size, + size_t num_layers, + cudnnDropoutDescriptor_t dropout_desc, + cudnnRNNInputMode_t input_mode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnRNNAlgo_t algo, + cudnnDataType_t math_precision); + +private: + + cudnnRNNDescriptor_t desc_{nullptr}; + +}; + //////////////////////////////////////////////////////////// // cuDNN tensor managers //////////////////////////////////////////////////////////// diff --git a/include/lbann/utils/file_utils.hpp b/include/lbann/utils/file_utils.hpp index 53f9c9b6be0..3b544b48227 100644 --- a/include/lbann/utils/file_utils.hpp +++ b/include/lbann/utils/file_utils.hpp @@ -70,7 +70,7 @@ bool check_if_dir_exists(const std::string& dirname); /** @todo Deprecated. Use @c lbann::file::make_directory instead. 
*/ bool create_dir(const std::string output_dir); -bool load_file(const std::string filename, std::vector& buf); +bool load_file(const std::string filename, std::vector& buf, bool append = false); inline void __swapEndianInt(unsigned int& ui) { ui = ((ui >> 24) | ((ui<<8) & 0x00FF0000) | ((ui>>8) & 0x0000FF00) | (ui << 24)); diff --git a/include/lbann/utils/impl/cuda.hpp b/include/lbann/utils/impl/cuda.hpp index 8fa2bb79ff9..2517c5ea7b3 100644 --- a/include/lbann/utils/impl/cuda.hpp +++ b/include/lbann/utils/impl/cuda.hpp @@ -178,14 +178,16 @@ WRAP_UNARY_CUDA_MATH_FUNCTION(atanh) template __device__ __forceinline__ bool isfinite(T const& x) { return ::isfinite(x); } - +template __device__ __forceinline__ +bool isinf(T const& x) { return ::isinf(x); } template __device__ __forceinline__ bool isnan(T const& x) { return ::isnan(x); } #if __CUDA_ARCH__ >= 530 template <> __device__ __forceinline__ bool isfinite(__half const& x) { return !(::__isnan(x) || ::__hisinf(x)); } - +template <> __device__ __forceinline__ +bool isinf(__half const& x) { return ::__hisinf(x); } template <> __device__ __forceinline__ bool isnan(__half const& x) { return ::__hisnan(x); } @@ -443,9 +445,9 @@ void apply_entrywise_unary_operator( // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); entrywise_unary_operator_kernel - <<>>( + <<>>( height, width, input.LockedBuffer(), input.LDim(), output.Buffer(), output.LDim()); } @@ -493,9 +495,9 @@ void apply_entrywise_binary_operator( // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); entrywise_binary_operator_kernel - <<>>( + <<>>( height, width, input1.LockedBuffer(), input1.LDim(), input2.LockedBuffer(), input2.LDim(), diff --git a/include/lbann/weights/weights_proxy.hpp b/include/lbann/weights/weights_proxy.hpp index 989bf79c6c7..81316abeb51 100644 --- a/include/lbann/weights/weights_proxy.hpp +++ b/include/lbann/weights/weights_proxy.hpp @@ -262,8 +262,14 @@ class WeightsProxy */ void synchronize_with_master() { - if (!empty() && !values_->Viewing()) { - El::Copy(master_weights_->get_values(), *values_); + if (!empty()) { + const auto& master_values = master_weights_->get_values(); + if (values_->Viewing()) { + El::LockedView(*values_, dynamic_cast(master_values)); + } + else { + El::Copy(master_values, *values_); + } } } diff --git a/model_zoo/data_readers/data_reader_jag.prototext b/model_zoo/data_readers/data_reader_jag.prototext index 6c5dc722528..e6218fa3686 100644 --- a/model_zoo/data_readers/data_reader_jag.prototext +++ b/model_zoo/data_readers/data_reader_jag.prototext @@ -14,10 +14,9 @@ data_reader { name: "jag_conduit" role: "train" shuffle: true - data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/" - index_list: "100Kindex.txt" - index_list_per_trainer: false - index_list_per_model: false + sample_list: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/100Kindex.txt" + sample_list_per_trainer: false + sample_list_per_model: false validation_percent: 0 absolute_sample_count: 0 @@ -34,10 +33,9 @@ data_reader { role: "test" shuffle: false # change to a lustre path - data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/" - index_list: "t1_sample_list.txt" - index_list_per_trainer: false - index_list_per_model: false + sample_list: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/t1_sample_list.txt" + 
sample_list_per_trainer: false + sample_list_per_model: false validation_percent: 0 absolute_sample_count: 0 diff --git a/python/lbann/contrib/launcher.py b/python/lbann/contrib/launcher.py index 775e79a7408..b66f80fd051 100644 --- a/python/lbann/contrib/launcher.py +++ b/python/lbann/contrib/launcher.py @@ -30,6 +30,20 @@ def is_nersc_center(): """ return bool(os.getenv('NERSC_HOST')) +def is_olcf_center(): + """Current system is operated by the Oak Ridge Leadership + Computing Facility at Oak Ridge National Laboratory. + + Checks whether the fully-qualified domain name ends with + ".ornl.gov". + + """ + domain = socket.getfqdn().split('.') + return (len(domain) > 2 + and domain[-2] == 'ornl' + and domain[-1] == 'gov') +# return bool(os.getenv('OLCF_MODULEPATH_ROOT')) + # Detect compute center and choose launcher _center = 'unknown' launcher = lbann.launcher @@ -41,6 +55,10 @@ def is_nersc_center(): _center = 'nersc' import lbann.contrib.nersc.launcher launcher = lbann.contrib.nersc.launcher +elif is_olcf_center(): + _center = 'olcf' + import lbann.contrib.olcf.launcher + launcher = lbann.contrib.olcf.launcher def compute_center(): """Name of organization that operates current system.""" diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index b31b6a7b763..9aaf54c9598 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -79,7 +79,7 @@ def set_environment(key, default): set_environment('MV2_USE_RDMA_CM', 0) # Optimizations for Sierra-like systems - if system in ('sierra', 'lassen'): + if system in ('sierra', 'lassen', 'rzansel'): # Set thread affinity # Note: Aluminum's default thread affinity is incorrect since diff --git a/python/lbann/contrib/lc/paths.py b/python/lbann/contrib/lc/paths.py index fecbf1dec6a..939eb39cc9e 100644 --- a/python/lbann/contrib/lc/paths.py +++ b/python/lbann/contrib/lc/paths.py @@ -106,3 +106,50 @@ def imagenet_labels(system = system(), data_set = 'train', return os.path.join(label_dir, 'test.txt') else: raise RuntimeError('unknown ImageNet data set (' + data_set + ')') + +def imagenet_sample_list(system = system(), data_set = 'train', + num_classes = 1000): + """ImageNet sample_list file on LC system. + + The file contains ground truth labels from the ILSVRC2012 + competition. It is a plain text file where each line contains an + image file path (relative to the ImageNet directory; see the + `imagenet_dir` function) and the corresponding label ID. + + There are three available data sets: 'training', 'validation', and + 'testing'. + + Some of these data sets have been preprocessed to only include + images in a subset of the label classes, e.g. images in the first + 10 label classes. This is convenient for quickly evaluating + performance or learning behavior. The availability of these + subsampled data sets may vary by system.
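+ + A hedged usage sketch (the 10-class training subset shown here is illustrative and may not exist on every system): + + path = imagenet_sample_list(data_set='train', num_classes=10)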
+ + """ + slist_dir = parallel_file_system_path(system) + slist_dir += 'brainusr/datasets/ILSVRC2012/sample_list/' + suffixes = {1000: '', 10: '_c0-9', 100: '_c0-99', + 200: '_c100-299', 300: '_c0-299'} + if data_set.lower() in ('train', 'training'): + if num_classes in suffixes.keys(): + return os.path.join(slist_dir, + 'train' + suffixes[num_classes] + '_sample_list.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('val', 'validation'): + if num_classes in suffixes.keys(): + return os.path.join(slist_dir, + 'val' + suffixes[num_classes] + '_sample_list.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('test', 'testing'): + return os.path.join(slist_dir, 'test_sample_list.txt') + else: + raise RuntimeError('unknown ImageNet data set (' + data_set + ')') diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py index 8e5641572ce..49a260f4559 100644 --- a/python/lbann/contrib/lc/systems.py +++ b/python/lbann/contrib/lc/systems.py @@ -23,6 +23,7 @@ def __init__(self, cores_per_node, gpus_per_node, scheduler): 'lassen': SystemParams(44, 4, 'lsf'), 'ray': SystemParams(40, 4, 'lsf'), 'sierra': SystemParams(44, 4, 'lsf'), + 'rzansel': SystemParams(44, 4, 'lsf'), } # Detect system diff --git a/python/lbann/contrib/olcf/__init__.py b/python/lbann/contrib/olcf/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/lbann/contrib/olcf/launcher.py b/python/lbann/contrib/olcf/launcher.py new file mode 100644 index 00000000000..75ea02aff53 --- /dev/null +++ b/python/lbann/contrib/olcf/launcher.py @@ -0,0 +1,102 @@ +import os +from lbann.contrib.olcf.systems import * +import lbann.launcher +from lbann.util import make_iterable + +def run(*args, **kwargs): + """Run LBANN with OLCF-specific optimizations (deprecated). + + This is deprecated. Use `lbann.contrib.launcher.run` instead. + + """ + + import warnings + warnings.warn( + 'Using deprecated function `lbann.contrib.olcf.launcher.run`. ' + 'Use `lbann.contrib.launcher.run` instead.' + ) + from ..launcher import run as _run + _run(*args, **kwargs) + +def make_batch_script( + system=system(), + procs_per_node=procs_per_node(), + scheduler=scheduler(), + launcher_args=[], + environment={}, + *args, + **kwargs, +): + """Construct batch script manager with OLCF-specific optimizations. + + This is a wrapper around `lbann.launcher.make_batch_script`, with + defaults and optimizations for OLCF systems. See that function for a + full list of options. + + """ + + # Create shallow copies of input arguments + launcher_args = list(make_iterable(launcher_args)) + environment = environment.copy() + + # Helper function to configure environment variables + # Note: User-provided values take precedence, followed by values + # in the environment, followed by default values. + def set_environment(key, default): + if key not in environment: + environment[key] = os.getenv(key, default) + + # Setup GPU bindings + # Note: Each Hydrogen process is assigned to the GPU index that + # matches its node communicator rank. This is not compatible with + # mpibind, which assigns a GPU with index 0 to each process.
We + # can't use an exclusive GPU compute mode since processes may + # touch the wrong GPU while figuring out ownership. + if scheduler == 'slurm' and has_gpu(system): + launcher_args.extend(['--mpibind=off', + '--nvidia_compute_mode=default']) + + # Optimizations for Summit-like systems + if system in ('summit',): + + # Set thread affinity + # Note: Aluminum's default thread affinity is incorrect since + # hwloc treats GPUs as NUMA domains. + # Note: There are actually 22 cores/socket, but it seems that + # powers of 2 are better for performance. + cores_per_socket = 16 + procs_per_socket = (procs_per_node + 1) // 2 + cores_per_proc = cores_per_socket // procs_per_socket + set_environment('AL_PROGRESS_RANKS_PER_NUMA_NODE', procs_per_socket) + set_environment('OMP_NUM_THREADS', cores_per_proc) + if scheduler == 'lsf': + launcher_args.append('--bind packed:{}'.format(cores_per_proc)) + + # Hack to enable process forking + # Note: InfiniBand is known to experience hangs if an MPI + # process is forked (see + # https://www.open-mpi.org/faq/?category=openfabrics#ofa-fork). + # Setting IBV_FORK_SAFE seems to fix this issue, but it may + # hurt performance (see + # https://linux.die.net/man/3/ibv_fork_init). + set_environment('IBV_FORK_SAFE', 1) + + # Hacked bugfix for hcoll (1/23/19) + # Note: Fixes hangs in MPI_Bcast. + set_environment('HCOLL_ENABLE_SHARP', 0) + set_environment('OMPI_MCA_coll_hcoll_enable', 0) + + # Hacked bugfix for Spectrum MPI PAMI (9/17/19) + set_environment('PAMI_MAX_NUM_CACHED_PAGES', 0) + + # Configure NVSHMEM to load Spectrum MPI + set_environment('NVSHMEM_MPI_LIB_NAME', 'libmpi_ibm.so') + + return lbann.launcher.make_batch_script( + procs_per_node=procs_per_node, + scheduler=scheduler, + launcher_args=launcher_args, + environment=environment, + *args, + **kwargs, + ) diff --git a/python/lbann/contrib/olcf/paths.py b/python/lbann/contrib/olcf/paths.py new file mode 100644 index 00000000000..2974dc3aa4a --- /dev/null +++ b/python/lbann/contrib/olcf/paths.py @@ -0,0 +1,110 @@ +"""Useful file paths on OLCF systems.""" +import os.path +from lbann.contrib.olcf.systems import system + +# ============================================== +# Data sets +# ============================================== + +def parallel_file_system_path(system = system()): + """Base path to parallel file system.""" + return '/ccs/proj/ast153/' + +def mnist_dir(system = system()): + """MNIST directory on OLCF system. + + The directory contains four files: train-images-idx3-ubyte, + train-labels-idx1-ubyte, t10k-images-idx3-ubyte, + t10k-labels-idx1-ubyte. These files can be obtained by downloading + from http://yann.lecun.com/exdb/mnist/ and uncompressing. + + """ + raise AssertionError("Unimplemented data set") + return parallel_file_system_path(system) + '/datasets/MNIST' + +def cifar10_dir(system = system()): + """CIFAR10 directory on OLCF systems.""" + raise AssertionError("Unimplemented data set") + return parallel_file_system_path(system) + '/datasets/cifar10-bin' + +def imagenet_dir(system = system(), data_set = 'training', + num_classes = 1000): + """ImageNet directory on OLCF system. + + The directory contains JPEG images from the ILSVRC2012 + competition. File names in the label file are relative to this + directory. The images can be obtained from + http://image-net.org/challenges/LSVRC/2012/. + + There are three available data sets: 'training', 'validation', and + 'testing'.
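+ + A hypothetical call, for illustration only (the function currently raises an AssertionError because the data set has not been staged on OLCF systems): + + train_dir = imagenet_dir(data_set='training', num_classes=1000)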
+ + Some of these data sets have been preprocessed to only include + images in a subset of the label classes, e.g. images in the first + 10 label classes. This is convenient for quickly evaluating + performance or learning behavior. The availability of these + subsampled data sets may vary by system. + + """ + raise AssertionError("Unimplemented data set") + base_path = parallel_file_system_path(system) + base_path += 'datasets/ILSVRC2012/original/' + if data_set.lower() in ('train', 'training'): + return base_path + 'train/' + elif data_set.lower() in ('val', 'validation'): + return base_path + 'val/' + elif data_set.lower() in ('test', 'testing'): + return base_path + 'test/' + else: + raise RuntimeError('unknown ImageNet data set (' + data_set + ')') + +def imagenet_labels(system = system(), data_set = 'train', + num_classes = 1000): + """ImageNet label file on OLCF system. + + The file contains ground truth labels from the ILSVRC2012 + competition. It is a plain text file where each line contains an + image file path (relative to the ImageNet directory; see the + `imagenet_dir` function) and the corresponding label ID. + + There are three available data sets: 'training', 'validation', and + 'testing'. + + Some of these data sets have been preprocessed to only include + images in a subset of the label classes, e.g. images in the first + 10 label classes. This is convenient for quickly evaluating + performance or learning behavior. The availability of these + subsampled data sets may vary by system. + + """ + raise AssertionError("Unimplemented data set") + label_dir = parallel_file_system_path(system) + if system in ('lassen', 'sierra'): + label_dir += 'datasets/ILSVRC2012/original/labels/' + else: + label_dir += 'datasets/ILSVRC2012/labels/' + suffixes = {1000: '', 10: '_c0-9', 100: '_c0-99', + 200: '_c100-299', 300: '_c0-299'} + if data_set.lower() in ('train', 'training'): + if num_classes in suffixes.keys(): + return os.path.join(label_dir, + 'train' + suffixes[num_classes] + '.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('val', 'validation'): + if num_classes in suffixes.keys(): + return os.path.join(label_dir, + 'val' + suffixes[num_classes] + '.txt') + else: + raise RuntimeError('invalid number of classes ({0}) ' + 'for ImageNet data set ({1})' + .format(num_classes, data_set)) + elif data_set.lower() in ('test', 'testing'): + return os.path.join(label_dir, 'test.txt') + else: + raise RuntimeError('unknown ImageNet data set (' + data_set + ')') diff --git a/python/lbann/contrib/olcf/systems.py b/python/lbann/contrib/olcf/systems.py new file mode 100644 index 00000000000..9a4fc408c63 --- /dev/null +++ b/python/lbann/contrib/olcf/systems.py @@ -0,0 +1,69 @@ +"""Default settings for OLCF systems.""" +import socket +import re + +# ============================================== +# Set system parameters +# ============================================== + +class SystemParams: + """Simple data structure to describe an OLCF system.""" + def __init__(self, cores_per_node, gpus_per_node, scheduler): + self.cores_per_node = cores_per_node + self.gpus_per_node = gpus_per_node + self.scheduler = scheduler + +# Supported OLCF systems +_system_params = { + 'summit': SystemParams(44, 6, 'lsf'), +} + +# Detect system +_system = re.sub(r'\d+', '', socket.gethostname()) +if _system not in _system_params.keys(): + _system = None + +# ============================================== +#
Access functions +# ============================================== + +def system(): + """Name of OLCF system.""" + if _system: + return _system + else: + raise RuntimeError('unknown system ' + '(' + socket.gethostname() + ')') + +def is_olcf_system(system = system()): + """Whether current system is a supported OLCF system.""" + return (system is not None) and (system in _system_params.keys()) + +def gpus_per_node(system = system()): + """Number of GPUs per node.""" + if not is_olcf_system(system): + raise RuntimeError('unknown system (' + system + ')') + return _system_params[system].gpus_per_node + +def has_gpu(system = system()): + """Whether OLCF system has GPUs.""" + return gpus_per_node(system) > 0 + +def cores_per_node(system = system()): + """Number of CPU cores per node.""" + if not is_olcf_system(system): + raise RuntimeError('unknown system (' + system + ')') + return _system_params[system].cores_per_node + +def scheduler(system = system()): + """Job scheduler for OLCF system.""" + if not is_olcf_system(system): + raise RuntimeError('unknown system (' + system + ')') + return _system_params[system].scheduler + +def procs_per_node(system = system()): + """Default number of processes per node.""" + if has_gpu(system): + return gpus_per_node(system) + else: + raise RuntimeError('unknown system (' + system + ')') diff --git a/python/lbann/modules/__init__.py b/python/lbann/modules/__init__.py index f590e2c52bb..7adccba18a1 100644 --- a/python/lbann/modules/__init__.py +++ b/python/lbann/modules/__init__.py @@ -9,3 +9,4 @@ from lbann.modules.base import Module, FullyConnectedModule, ConvolutionModule, Convolution2dModule, Convolution3dModule from lbann.modules.rnn import LSTMCell, GRU from lbann.modules.transformer import MultiheadAttention +from lbann.modules.graph import * diff --git a/python/lbann/modules/graph/__init__.py b/python/lbann/modules/graph/__init__.py new file mode 100644 index 00000000000..c987a56a23b --- /dev/null +++ b/python/lbann/modules/graph/__init__.py @@ -0,0 +1,12 @@ +"""Graph neural network modules. + +Some common graph kernels for graph structured data commonly used for graph +convolutional networks. 
+ +""" + +#import from sub modules + +from lbann.modules.graph.utils import GraphVertexData +from lbann.modules.graph.dense import DenseGCNConv, DenseGraphConv +from lbann.modules.graph.sparse import GCNConv, GINConv, GraphConv, GatedGraphConv diff --git a/python/lbann/modules/graph/dense/DenseGCNConv.py b/python/lbann/modules/graph/dense/DenseGCNConv.py new file mode 100644 index 00000000000..1d7f557ac28 --- /dev/null +++ b/python/lbann/modules/graph/dense/DenseGCNConv.py @@ -0,0 +1,27 @@ +import lbann +from lbann.modules import Module +from lbann.util import str_list +import math + + +class DenseGCNConv(Module): + global_count = 0 + + def __init__(self, input_channels, output_channels, name=None): + super().__init__() + DenseGCNConv.global_count += 1 + + self.name = (name if name else 'Dense_GCN_{}'.format(DenseGCNConv.global_count)) + + + bounds = math.sqrt(6.0 / (input_channels + output_channels)) + self.weights = lbann.Weights(initializer = lbann.UniformInitializer(min=-bounds,max=bounds), + name=self.name+'_Weights') + + self.W = lbann.WeightsLayer(dims = str_list([input_channels, output_channels]), + name = self.name+'_layer', + weights =self.weights) + def forward(self,X,A): + out = lbann.MatMul(X,self.W, name=self.name+'_weight_mult') + out = lbann.MatMul(A, out, name=self.name+'_adj_mult') + return out diff --git a/python/lbann/modules/graph/dense/DenseGraphConv.py b/python/lbann/modules/graph/dense/DenseGraphConv.py new file mode 100644 index 00000000000..a2baddc5448 --- /dev/null +++ b/python/lbann/modules/graph/dense/DenseGraphConv.py @@ -0,0 +1,34 @@ +import lbann +from lbann.modules import Module +from lbann.util import str_list +import math + +class DenseGraphConv(Module): + global_count = 0 + def __init__(self, input_channels, output_channels, name=None): + super().__init__() + self.name = (name if name else 'DenseGraph_{}'.format(DenseGraphConv.global_count)) + + DenseGraphConv.global_count+=1 + + bounds = math.sqrt(6.0/(input_channels + output_channels)) + + self.weights_1 = lbann.Weights(initializer = lbann.UniformInitializer(min=-bounds, max=bounds), + name=self.name+'_Weights_1') + self.weights_2 = lbann.Weights(initializer = lbann.UniformInitializer(min=-bounds, max=bounds), + name=self.name+'_Weights_2') + self.W1 = lbann.WeightsLayer(dims = str_list([input_channels, output_channels]), + name=self.name+'_param_1', + weights = self.weights_1) + self.W2 = lbann.WeightsLayer(dims = str_list([input_channels, output_channels]), + name=self.name+'_param_2', + weights = self.weights_2) + def forward(self, X, A): + messages = lbann.MatMul(X, self.W2, name=self.name+'_w2_mult') + messages = lbann.MatMul(A,messages,name=self.name+'_adj_mult') + + ident = lbann.MatMul(X, self.W1, name=self.name+'_w1_mult') + + out = lbann.Sum(ident, messages, name=self.name+'_sum_id') + + return out diff --git a/python/lbann/modules/graph/dense/__init__.py b/python/lbann/modules/graph/dense/__init__.py new file mode 100644 index 00000000000..31f52337832 --- /dev/null +++ b/python/lbann/modules/graph/dense/__init__.py @@ -0,0 +1,7 @@ +from .DenseGCNConv import DenseGCNConv +from .DenseGraphConv import DenseGraphConv + +__all__ = [ + 'DenseGCNConv' + 'DenseGraphConv' + ] diff --git a/python/lbann/modules/graph/sparse/GCNConv.py b/python/lbann/modules/graph/sparse/GCNConv.py new file mode 100644 index 00000000000..5f44b4e29bc --- /dev/null +++ b/python/lbann/modules/graph/sparse/GCNConv.py @@ -0,0 +1,120 @@ +import lbann +from lbann.modules import Module +from lbann.modules.graph.utils import 
GraphVertexData +from lbann.util import str_list +import lbann.modules.base +import math + +class GCNConv(Module): + """GCN Conv layer. See: + + https://arxiv.org/abs/1609.02907 + + """ + + global_count = 0 + + def __init__(self, + input_channels, + output_channels, + bias=True, + activation = lbann.Relu, + name=None, + data_layout = 'data_parallel'): + """Initialize GCN layer + + Args: + input_channels (int): The size of the input node features + output_channels (int): The output size of the node features + bias (bool): Whether to apply biases after MatMul + activation (type): Activation layer for the node features. If None, then no activation is + applied. (default: lbann.Relu) + name (str): Default name of the layer is GCN_{number} + data_layout (str): Data layout + """ + super().__init__() + + ## Add variables + + self.input_channels = input_channels + self.output_channels = output_channels + self.data_layout = data_layout + + ## Add Name for the components for the layer + GCNConv.global_count += 1 + self.name = (name + if name + else 'GCN_{}'.format(GCNConv.global_count)) + + ## Initialize weights for the matrix + value = math.sqrt(6 / (input_channels + output_channels)) + + self.mat_weights = lbann.Weights(initializer = lbann.UniformInitializer( + min = -value, + max = value), + name = self.name+'_Weights') + + self.W = lbann.WeightsLayer(dims = str_list([input_channels, output_channels]), + name = self.name+'_layer', + weights = self.mat_weights, + data_layout = self.data_layout) + + ## Initialize bias variables + self.has_bias = bias + self.bias_weights = None + self.bias = None + + if (self.has_bias): + self.bias_weights = lbann.Weights(initializer = lbann.ConstantInitializer( + value = 0.0), + name = self.name+'_bias_weights') + self.bias = lbann.WeightsLayer(dims = str_list([1,output_channels]), + weights = self.bias_weights, + name = self.name+'_bias_layer', + data_layout = self.data_layout) + + self.activation = None + + if activation: + if isinstance(activation, type): + self.activation = activation + else: + self.activation = type(activation) + if not issubclass(self.activation, lbann.Layer): + raise ValueError('activation must be a layer') + + def forward(self, X, A): + """Apply GCN + + Args: + X (GraphVertexData): LBANN Data object, which is a collection of Layers. Each Layer is of + the shape (1,input_channels) + A (Layer): Adjacency matrix input with shape (num_nodes, num_nodes). + The adjacency matrix is assumed to be normalized in the + pre-processing step. + Returns: + (GraphVertexData): The output after GCN. The output can be passed into another Graph Conv layer + directly + """ + + # Assume X is a lbann data object + for i in range(X.shape[0]): + X[i] = lbann.MatMul(X[i], self.W, name=self.name+'_message_'+str(i)) + if (self.bias): + X[i] = lbann.Sum(X[i], self.bias, name=self.name+'_message_bias_'+str(i)) + + # Pass Message to Node Features + out = X.get_mat(self.output_channels) + + # A - adjacency matrix is assumed to be normalized such that + # A = D^-0.5 A D^-0.5 as the convention in the GCN paper.
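+ # (Assumed convention, following Kipf & Welling: with self-loops added, + # A_hat = D^{-1/2} (A + I) D^{-1/2} is precomputed during preprocessing, + # so the aggregation below is a single dense MatMul with A_hat.)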
+ out = lbann.MatMul(A, out, name=self.name+'_aggregate') + + if self.activation: + out = self.activation(out) + + out = GraphVertexData.matrix_to_graph(out, X.shape[0], self.output_channels) + + return out + + diff --git a/python/lbann/modules/graph/sparse/GINConv.py b/python/lbann/modules/graph/sparse/GINConv.py new file mode 100644 index 00000000000..cf5de075cca --- /dev/null +++ b/python/lbann/modules/graph/sparse/GINConv.py @@ -0,0 +1,76 @@ +import lbann +from lbann.modules import Module +from lbann.modules.graph.utils import GraphVertexData +from lbann.util import str_list + +class GINConv(Module): + """Details of the kernel are available in: + https://arxiv.org/abs/1810.00826 + """ + global_count = 0 + + def __init__(self, + sequential_nn, + output_channels, + eps = 1e-6, + name = None, + data_layout = 'data_parallel'): + """Initialize graph kernel as described in Graph Isomorphism Network. + + Args: + sequential_nn ([Module] or (Module)): A list or tuple of layer modules to be used + output_channels (int): The output size of the node features + eps (float): Default value is 1e-6 + name (str): Default name of the layer is GIN_{number} + data_layout (str): Data layout + """ + super().__init__() + GINConv.global_count += 1 + self.name = (name + if name + else 'GIN_{}'.format(GINConv.global_count)) + self.data_layout = data_layout + self.nn = sequential_nn + self.eps = eps + self.output_channels = output_channels + + + def forward(self, X, A, activation = lbann.Relu): + """Apply GIN Layer. + + Args: + X (GraphVertexData): LBANN Data object, which is a collection of Layers. Each Layer is of + the shape (1,input_channels) + + A (Layer): Adjacency matrix input with shape (num_nodes, num_nodes) + + activation (Layer): Activation layer for the node features. If None, then no activation is + applied. (default: lbann.Relu) + Returns: + + (GraphVertexData): The output after GIN. The output can be passed into another Graph Conv layer + directly + """ + in_channel = X.shape[1] + + # Accumulate Messages from Neighboring Nodes + out = X.get_mat() + out = lbann.MatMul(A, out, name = self.name+"_GIN_MATMUL") + message = GraphVertexData.matrix_to_graph(out, X.shape[0], in_channel) + + # Aggregate Messages into node features + eps = lbann.Constant(value=(1+self.eps), num_neurons = str_list([1, in_channel])) + for node_feature in range(X.shape[0]): + eps_val = lbann.Multiply(eps, X[node_feature]) + X[node_feature] = lbann.Sum(message[node_feature], eps_val) + + # Transform with the sequence of linear layers + for layer in self.nn: + for node_feature in range(X.shape[0]): + X[node_feature] = layer(X[node_feature]) + + ## Apply activation + if activation: + for node_feature in range(X.shape[0]): + X[node_feature] = activation(X[node_feature]) + X.update_num_features(self.output_channels) + return X diff --git a/python/lbann/modules/graph/sparse/GatedGraphConv.py b/python/lbann/modules/graph/sparse/GatedGraphConv.py new file mode 100644 index 00000000000..9cf2f09d7cd --- /dev/null +++ b/python/lbann/modules/graph/sparse/GatedGraphConv.py @@ -0,0 +1,98 @@ +import lbann +from lbann.modules import Module +from lbann.util import str_list +from lbann.modules.graph.utils import GraphVertexData +import lbann.modules +import math + +class GatedGraphConv(Module): + """Gated Graph Convolution layer.
For kernel details, see: + + https://arxiv.org/abs/1511.05493 + + Implementation in the spirit of: + + https://github.com/rusty1s/pytorch_geometric/blob/\ + master/torch_geometric/nn/conv/gated_graph_conv.py + """ + global_count = 0 + def __init__(self, + output_channels, + num_layers = 1, + name = None, + data_layout = 'data_parallel'): + """Initialize GatedGraph layer + Args: + output_channels (int): The output size of the node features + num_layers (int): Number of passes through the GRU (default: 1) + name (str): Name of the layers and prefix to use for the layers. + data_layout (str): Data layout (default: data parallel) + """ + super().__init__() + + ## Add Name for the components for the layer + GatedGraphConv.global_count += 1 + self.name = (name + if name + else 'GatedGraphConv_{}'.format(GatedGraphConv.global_count)) + + + ## Add variables + self.output_channels = output_channels + self.rnn = lbann.modules.GRU(output_channels) + + self.num_layers = num_layers + self.data_layout = data_layout + + self.weights = [] + + for i in range(num_layers): + + weight_init = lbann.Weights(initializer = lbann.UniformInitializer(min = -1/(math.sqrt(output_channels)), + max = 1/(math.sqrt(output_channels)))) + weight_layer = lbann.WeightsLayer(dims = str_list([output_channels, output_channels]), + weights = weight_init, + name = self.name+'_'+str(i)+'_weight', + data_layout = self.data_layout) + self.weights.append(weight_layer) + + + def forward(self, X, A): + """Call the GatedGraphConv + Args: + X (GraphVertexData): LBANN Data object, which is a collection of Layers. Each Layer is of + the shape (1,input_channels) + A (Layer): Adjacency matrix input with shape (num_nodes, num_nodes) + Returns: + (GraphVertexData): The output after the Gated Graph kernel. + The output can be passed into another Graph Conv layer directly + + """ + + input_features = X.size(1) + num_nodes = X.size(0) + + if (input_features < self.output_channels): + for i in range(num_nodes): + num_zeros = self.output_channels - input_features + zeros = lbann.Constant(value = 0, num_neurons = str_list([1,num_zeros]), name = self.name+'_zero_'+str(i)) + X[i] = lbann.Concatenation(X[i], zeros, axis = 1) + elif (input_features > self.output_channels): + raise ValueError('The feature size of the nodes {} cannot be greater than the output dimension {}'. + format(input_features, self.output_channels)) + + X.update_num_features(self.output_channels) + + for layer in range(self.num_layers): + # Message passing followed by a GRU update on each node + X_mat = X.get_mat() + messages = lbann.MatMul(X_mat, self.weights[layer]) + aggregate = lbann.MatMul(A, messages) + + M = GraphVertexData.matrix_to_graph(aggregate, num_nodes, self.output_channels) + + for i in range(num_nodes): + X[i] = lbann.Reshape(X[i], dims = str(self.output_channels)) + X[i] = lbann.Reshape(self.rnn(M[i], X[i])[1], + dims = str_list([1, self.output_channels])) + + return X diff --git a/python/lbann/modules/graph/sparse/GraphConv.py b/python/lbann/modules/graph/sparse/GraphConv.py new file mode 100644 index 00000000000..327d47bb945 --- /dev/null +++ b/python/lbann/modules/graph/sparse/GraphConv.py @@ -0,0 +1,132 @@ +import lbann +from lbann.modules import Module +from lbann.modules.graph.utils import GraphVertexData +from lbann.util import str_list +import lbann.modules.base +import math + +class GraphConv(Module): + """Graph Conv layer.
diff --git a/python/lbann/modules/graph/sparse/GraphConv.py b/python/lbann/modules/graph/sparse/GraphConv.py
new file mode 100644
index 00000000000..327d47bb945
--- /dev/null
+++ b/python/lbann/modules/graph/sparse/GraphConv.py
@@ -0,0 +1,132 @@
+import lbann
+from lbann.modules import Module
+from lbann.modules.graph.utils import GraphVertexData
+from lbann.util import str_list
+import lbann.modules.base
+import math
+
+class GraphConv(Module):
+    """Graph Conv layer. See:
+
+       https://arxiv.org/abs/1609.02907
+    """
+
+    global_count = 0
+
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 bias=True,
+                 activation=lbann.Relu,
+                 name=None,
+                 data_layout='data_parallel'):
+        """Initialize the Graph layer.
+
+        Args:
+            input_channels (int): The size of the input node features
+            output_channels (int): The output size of the node features
+            bias (bool): Whether to apply biases after MatMul
+            activation (type): Activation layer for the node features. If None, then no
+                activation is applied. (default: lbann.Relu)
+            name (str): Default name of the layer is Graph_{number}
+            data_layout (str): Data layout
+        """
+        super().__init__()
+
+        ## Add variables
+        self.input_channels = input_channels
+        self.output_channels = output_channels
+        self.data_layout = data_layout
+
+        ## Add name for the components of the layer
+        GraphConv.global_count += 1
+        self.name = (name
+                     if name
+                     else 'Graph_{}'.format(GraphConv.global_count))
+
+        ## Initialize the weight matrices (Glorot-style uniform range)
+        value = math.sqrt(6 / (input_channels + output_channels))
+
+        self.mat_weights = lbann.Weights(initializer=lbann.UniformInitializer(
+                                             min=-value,
+                                             max=value),
+                                         name=self.name+'_Weights')
+
+        self.weights1 = lbann.WeightsLayer(dims=str_list([input_channels, output_channels]),
+                                           name=self.name+'_layer',
+                                           weights=self.mat_weights)
+
+        self.id_weights = lbann.Weights(initializer=lbann.UniformInitializer(
+                                            min=-value,
+                                            max=value),
+                                        name=self.name+'_ID_Weights')
+
+        self.weights2 = lbann.WeightsLayer(dims=str_list([input_channels, output_channels]),
+                                           name=self.name+'_ID_layer',
+                                           weights=self.id_weights)
+
+        ## Initialize bias variables
+        self.has_bias = bias
+        self.bias_weights = None
+        self.bias = None
+
+        if (self.has_bias):
+            self.bias_weights = lbann.Weights(initializer=lbann.ConstantInitializer(
+                                                  value=0.0),
+                                              name=self.name+'_bias_weights')
+            self.bias = lbann.WeightsLayer(dims=str_list([1, output_channels]),
+                                           weights=self.bias_weights,
+                                           name=self.name+'_bias_layer')
+
+        self.activation = None
+
+        if activation:
+            if isinstance(activation, type):
+                self.activation = activation
+            else:
+                self.activation = type(activation)
+            if not issubclass(self.activation, lbann.Layer):
+                raise ValueError('activation must be a layer')
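+    # Dense sketch of what forward() below computes (illustrative only,
+    # not part of the compute graph): with adjacency matrix A, neighbor
+    # weights W (self.weights1), self-loop weights W_id (self.weights2),
+    # and bias b,
+    #
+    #     X_out = A @ X @ W + X @ W_id + b
+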
+    def forward(self, X, A):
+        """Apply the Graph Conv layer to X, using A for message passing.
+
+        Args:
+            X (GraphVertexData): LBANN Data object, which is a collection of Layers. Each Layer
+                has the shape (1, input_channels)
+            A (Layer): Adjacency matrix input with shape (num_nodes, num_nodes)
+        Returns:
+            (GraphVertexData): The output after convolution. The output can be passed into
+                another graph convolution layer directly
+        """
+
+        # Accumulate messages from neighboring nodes
+        out = X.get_mat()
+        out = lbann.MatMul(out, self.weights1, name=self.name+"_Graph_MATMUL")
+        message = lbann.MatMul(A, out, name=self.name+"_Graph_Message")
+        message = GraphVertexData.matrix_to_graph(message, X.shape[0], self.output_channels)
+
+        # Transform the self-features of each node
+        for node_feature in range(X.shape[0]):
+            X[node_feature] = lbann.MatMul(X[node_feature], self.weights2)
+
+        for node_feature in range(X.shape[0]):
+            if (self.bias):
+                message[node_feature] = lbann.Sum(message[node_feature],
+                                                  self.bias,
+                                                  name=self.name+'_message_bias_'+str(node_feature))
+            X[node_feature] = lbann.Sum(X[node_feature], message[node_feature])
+
+        if self.activation:
+            for node_feature in range(X.shape[0]):
+                X[node_feature] = self.activation(X[node_feature])
+
+        X.update_num_features(self.output_channels)
+        return X
diff --git a/python/lbann/modules/graph/sparse/__init__.py b/python/lbann/modules/graph/sparse/__init__.py
new file mode 100644
index 00000000000..9b089525863
--- /dev/null
+++ b/python/lbann/modules/graph/sparse/__init__.py
@@ -0,0 +1,11 @@
+"""Neural network modules for graph convolutional models."""
+from .GINConv import GINConv
+from .GCNConv import GCNConv
+from .GraphConv import GraphConv
+from .GatedGraphConv import GatedGraphConv
+__all__ = [
+    'GCNConv',
+    'GINConv',
+    'GraphConv',
+    'GatedGraphConv',
+    ]
diff --git a/python/lbann/modules/graph/utils.py b/python/lbann/modules/graph/utils.py
new file mode 100644
index 00000000000..7c9aae6fd94
--- /dev/null
+++ b/python/lbann/modules/graph/utils.py
@@ -0,0 +1,103 @@
+import lbann
+from lbann.util import str_list
+
+class GraphVertexData:
+    def __init__(self, layers, num_features):
+        """Object to hold a list of layers, where each layer represents a vertex
+           in a graph.
+
+        Args:
+            layers (iterator of layers): One-dimensional iterator of node
+                features with N nodes
+            num_features (int): The number of features per vertex
+        """
+        self.shape = (len(layers), num_features)
+        self.layers = layers
+        self.num_nodes = len(layers)
+        self.num_features = num_features
+
+    def __getitem__(self, node):
+        """Get the feature vector of the given node, represented as an LBANN layer.
+
+        Args:
+            node (int): The node to retrieve the features for.
+        Returns:
+            (Layer): The features of that vertex of the graph.
+        """
+        return self.layers[node]
+
+    def __setitem__(self, node, feature):
+        """Replace the feature vector of the given node.
+
+        Args:
+            node (int): The node whose features to set.
+            feature (Layer): The new feature vector.
+        """
+        self.layers[node] = feature
+
+    def update_num_features(self, num_features):
+        """Update the internal shape to keep track of the feature size.
+
+        Args:
+            num_features (int): The number of features per vertex.
+        """
+        self.num_features = num_features
+        self.shape = (len(self.layers), num_features)
+
+    def size(self, index=None):
+        """Get the size (shape) of the GraphVertexData object as a tuple (n, m),
+           where n is the number of nodes and m is the number of features per node.
+
+        Args:
+            index (int): 0 to return the number of nodes; 1 to return the number
+                of features (default: None, which returns the tuple).
+        Returns:
+            (int) or (int, int): Either the tuple (n, m), or n, or m.
+        """
+        if isinstance(index, int):
+            return self.shape[index]
+        else:
+            return self.shape
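+    # Usage sketch (hypothetical shapes): given four per-node feature
+    # layers of width 16 wrapped as g = GraphVertexData(layers, 16):
+    #   g.size()   -> (4, 16)
+    #   g.size(0)  -> 4
+    #   g[2]       -> LBANN layer holding node 2's features
+    #   g[2] = lbann.Relu(g[2])   # replace a node's features in place
+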
+    def get_mat(self, cols=None):
+        """Generate a matrix representation of the graph data.
+
+        Args:
+            cols (int): If given, the number of feature columns to reshape the
+                concatenated node features into (default: None, which uses the
+                current feature size).
+        Returns:
+            (Layer): A (num_nodes x num_features) matrix of node features.
+        """
+        mat = lbann.Concatenation(self.layers)
+
+        if (cols):
+            mat = lbann.Reshape(mat, dims=str_list([self.shape[0], cols]))
+        else:
+            mat = lbann.Reshape(mat, dims=str_list([self.shape[0], self.shape[1]]))
+
+        return mat
+
+    def clone(self):
+        """Generate a clone of the GraphVertexData object. Results in a
+           split in the DAG.
+        """
+        cloned_layers = []
+
+        for i, node in enumerate(self.layers):
+            temp = lbann.Split(node)
+            self.layers[i] = lbann.Identity(temp)
+            cloned_layers.append(lbann.Identity(temp))
+
+        return GraphVertexData(cloned_layers, self.num_features)
+
+    @classmethod
+    def matrix_to_graph(cls, mat_layer, num_vertices, num_features):
+        """Given a 2D matrix of shape (num_vertices, num_features), return a
+           GraphVertexData object with num_vertices nodes, each with
+           num_features features.
+        """
+        slice_points = str_list([i for i in range(0, num_vertices * num_features + 1, num_features)])
+        flattened_layer = lbann.Reshape(mat_layer, dims=str(num_vertices * num_features))
+        sliced_mat_layer = lbann.Slice(flattened_layer, axis=0, slice_points=slice_points)
+
+        list_of_layers = []
+        for node in range(num_vertices):
+            temp = lbann.Identity(sliced_mat_layer)
+            list_of_layers.append(lbann.Reshape(temp, dims=str_list([1, num_features])))
+        return cls(list_of_layers, num_features)
diff --git a/python/lbann/modules/rnn.py b/python/lbann/modules/rnn.py
index 4d3ee1a5982..630ca23e31a 100644
--- a/python/lbann/modules/rnn.py
+++ b/python/lbann/modules/rnn.py
@@ -213,6 +213,13 @@ def __init__(self, size, bias = True,
             data_layout=self.data_layout
         )
 
+        # Reusable all-ones vector for the (1 - z_t) term; avoids creating
+        # a fresh Constant layer on every unrolled step.
+        self.ones = lbann.Constant(
+            value=1.0,
+            num_neurons=str(size),
+            data_layout=self.data_layout,
+            name=self.name+'_ones',
+        )
+
     def forward(self, x, prev_state):
         """Apply GRU step.
@@ -285,7 +292,7 @@ def forward(self, x, prev_state):
                 lbann.Add(
                     lbann.Multiply(
                         lbann.WeightedSum(
-                            lbann.Constant(value=1.0, hint_layer=zt, data_layout=self.data_layout),
+                            self.ones,
                             zt,
                             scaling_factors='1 -1', data_layout=self.data_layout
                         ),
diff --git a/scripts/build_lbann_from_source.sh b/scripts/build_lbann_from_source.sh
index 3320cbcf355..ec97d078a70 100755
--- a/scripts/build_lbann_from_source.sh
+++ b/scripts/build_lbann_from_source.sh
@@ -7,7 +7,7 @@ if [ -n "${SPACK_ROOT}" ]; then
 fi
 
 SPACK_VERSION=$(spack --version | sed 's/-.*//g')
-MIN_SPACK_VERSION=0.13.3
+MIN_SPACK_VERSION=0.15.4
 
 source $(dirname ${BASH_SOURCE})/utilities.sh
 
@@ -34,6 +34,7 @@ fi
 
 LBANN_HOME=$(dirname ${SCRIPTS_DIR})
 SPACK_ENV_DIR=${LBANN_HOME}/spack_environments
+NINJA_NUM_PROCESSES=0 # Let ninja decide
 
 # Identify the center that we are running at
 CENTER=
@@ -46,11 +47,19 @@ if [[ ${SYS} = "Darwin" ]]; then
     BUILD_SUFFIX=llnl.gov
 else
     CORI=$([[ $(hostname) =~ (cori|cgpu) ]] && echo 1 || echo 0)
+    DOMAINNAME=$(python -c 'import socket; domain = socket.getfqdn().split("."); print(domain[-2] + "."
+ domain[-1])') if [[ ${CORI} -eq 1 ]]; then CENTER="nersc" # Make sure to purge and setup the modules properly prior to finding the Spack architecture source ${SPACK_ENV_DIR}/${CENTER}/setup_modules.sh BUILD_SUFFIX=nersc.gov + elif [[ ${DOMAINNAME} = "ornl.gov" ]]; then + CENTER="olcf" + BUILD_SUFFIX=${DOMAINNAME} + NINJA_NUM_PROCESSES=16 # Don't let OLCF kill build jobs + elif [[ ${DOMAINNAME} = "llnl.gov" ]]; then + CENTER="llnl_lc" + BUILD_SUFFIX=${DOMAINNAME} else CENTER="llnl_lc" BUILD_SUFFIX=llnl.gov @@ -102,6 +111,7 @@ Options: ${C}--instrument${N} Use -finstrument-functions flag, for profiling stack traces ${C}-s | --superbuild${N} Superbuild LBANN with hydrogen and aluminum ${C}-c | --distconv${N} Enable the DistConv library + ${C}--ninja-processes${N} Number of parallel processes for ninja. EOF } @@ -208,6 +218,15 @@ while :; do # MPI-CUDA backend is required for Distconv ALUMINUM_WITH_MPI_CUDA=ON ;; + --ninja-processes) + if [ -n "${2}" ]; then + NINJA_NUM_PROCESSES=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + exit 1 + fi + ;; -?*) # Unknown option echo "Unknown option (${1})" >&2 @@ -266,9 +285,16 @@ fi source ${SPACK_ENV_DIR}/${SUPERBUILD} -ninja install +if [ ${NINJA_NUM_PROCESSES} -ne 0 ]; then + BUILD_COMMAND="ninja -j${NINJA_NUM_PROCESSES}" +else + # Usually equivalent to -j + BUILD_COMMAND="ninja" +fi + +${BUILD_COMMAND} install echo "To rebuild the environment:" echo " ${SPACK_ENV_CMD}" echo " cd ${LBANN_BUILD_DIR}" -echo " ninja install" +echo " ${BUILD_COMMAND} install" diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh index 8745243528d..a41b00e2975 100755 --- a/scripts/build_lbann_lc.sh +++ b/scripts/build_lbann_lc.sh @@ -330,7 +330,7 @@ fi # Load packages if [ ${USE_MODULES} -ne 0 ]; then module load git - module load cmake/3.14.5 + module load cmake/3.16.8 else use git fi @@ -805,7 +805,6 @@ cmake \ -D LBANN_SB_BUILD_PROTOBUF=ON \ -D LBANN_SB_BUILD_CUB=${WITH_CUB} \ -D LBANN_SB_BUILD_ALUMINUM=${WITH_ALUMINUM} \ --D ALUMINUM_TAG=v0.3.3 \ -D ALUMINUM_ENABLE_MPI_CUDA=${ALUMINUM_WITH_MPI_CUDA} \ -D ALUMINUM_ENABLE_NCCL=${ALUMINUM_WITH_NCCL} \ -D LBANN_SB_BUILD_CONDUIT=${WITH_CONDUIT} \ diff --git a/scripts/install_lbann.sh b/scripts/install_lbann.sh index e82b3587130..8778bd17b89 100755 --- a/scripts/install_lbann.sh +++ b/scripts/install_lbann.sh @@ -7,7 +7,7 @@ if [ -n "${SPACK_ROOT}" ]; then fi SPACK_VERSION=$(spack --version | sed 's/-.*//g') -MIN_SPACK_VERSION=0.13.3 +MIN_SPACK_VERSION=0.15.4 source $(dirname ${BASH_SOURCE})/utilities.sh @@ -41,10 +41,15 @@ if [[ ${SYS} = "Darwin" ]]; then CENTER="osx" else CORI=$([[ $(hostname) =~ (cori|cgpu) ]] && echo 1 || echo 0) + DOMAINNAME=$(python -c 'import socket; domain = socket.getfqdn().split("."); print(domain[-2] + "." 
+ domain[-1])') if [[ ${CORI} -eq 1 ]]; then CENTER="nersc" # Make sure to purge and setup the modules properly prior to finding the Spack architecture source ${SPACK_ENV_DIR}/${CENTER}/setup_modules.sh + elif [[ ${DOMAINNAME} = "ornl.gov" ]]; then + CENTER="olcf" + elif [[ ${DOMAINNAME} = "llnl.gov" ]]; then + CENTER="llnl_lc" else CENTER="llnl_lc" fi @@ -56,7 +61,7 @@ SPACK_ARCH_TARGET=$(spack arch -t) SCRIPT=$(basename ${BASH_SOURCE}) BUILD_DIR=${LBANN_HOME}/build/spack ENABLE_GPUS=ON -GPU_VARIANTS="+gpu+nccl" +GPU_VARIANTS="+cuda+nccl" ENABLE_HALF=OFF HALF_VARIANTS="~half" BUILD_TYPE=Release @@ -158,12 +163,14 @@ DIHYDROGEN_VARIANTS="variants: +shared +al +openmp ${HALF_VARIANTS}" if [[ ${DEPS_ONLY} = "TRUE" ]]; then if [[ ${SYS} != "Darwin" ]]; then HYDROGEN_VARIANTS="${HYDROGEN_VARIANTS} +openmp_blas" + DIHYDROGEN_VARIANTS="${DIHYDROGEN_VARIANTS} +openmp_blas" COMPILER_PACKAGE=$(cat <= partner_score)) { + if ((m_low_score_wins && local_score <= partner_score) + || (!m_low_score_wins && local_score >= partner_score) + || (!std::isnan(local_score) && std::isnan(partner_score))) { tournament_winner = local_trainer; switch (m_comm_algo) { case communication_algorithm::sendrecv_weights: diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index d95cb7e05ba..aee74bef640 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ -47,7 +47,7 @@ namespace callback { profiler::profiler(bool sync, bool skip_init) : callback_base(), m_sync(sync), m_skip_init(skip_init) { #ifdef LBANN_NVPROF - nvtxNameCudaStreamA(El::GPUManager::Stream(), "Hydrogen"); + nvtxNameCudaStreamA(hydrogen::cuda::GetDefaultStream(), "Hydrogen"); #endif if (!m_skip_init) { prof_start(); diff --git a/src/callbacks/sync_layers.cpp b/src/callbacks/sync_layers.cpp index f2f3efdeb0f..fea7e50c55e 100644 --- a/src/callbacks/sync_layers.cpp +++ b/src/callbacks/sync_layers.cpp @@ -58,7 +58,7 @@ void sync_layers::on_backward_prop_end(model *m, Layer *l) { void sync_layers::do_sync(Layer *l) { #ifdef LBANN_HAS_CUDNN if (m_sync_gpus) { - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); } #endif if (m_sync_mpi) { diff --git a/src/comm.cpp b/src/comm.cpp index ff07ee7ca9c..92f97b57931 100644 --- a/src/comm.cpp +++ b/src/comm.cpp @@ -167,7 +167,7 @@ void UpdateRequest(typename ::Al::NCCLBackend::req_type& req, El::SyncInfo const& si) noexcept { if (req) - req->orig_stream = si.stream_; + req->orig_stream = si.Stream(); } #endif // AL_HAS_NCCL @@ -181,7 +181,7 @@ void UpdateRequest(typename ::Al::MPICUDABackend::req_type& req, El::SyncInfo const& si) noexcept { if (req) - req->orig_stream = si.stream_; + req->orig_stream = si.Stream(); } #endif // AL_HAS_MPI_CUDA #endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_ALUMINUM) diff --git a/src/data_coordinator/data_coordinator.cpp b/src/data_coordinator/data_coordinator.cpp index 13c3178646e..c1e63be17fd 100644 --- a/src/data_coordinator/data_coordinator.cpp +++ b/src/data_coordinator/data_coordinator.cpp @@ -26,6 +26,7 @@ #include #include +#include namespace lbann { @@ -94,13 +95,27 @@ void data_coordinator::calculate_num_iterations_per_epoch(int max_mini_batch_siz " :: generic_data_distribution: number of parallel readers is zero"); } +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + // #trainers is assumed to be 1. 
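+    // Rationale: Distconv's slab-based I/O splits each sample across the
+    // ranks of a single trainer, so multi-trainer runs are rejected here.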
+ assert_eq(this->m_comm->get_num_trainers(), 1); + } +#endif + /// Set the basic parameters for stride and offset of the data reader int batch_stride = max_mini_batch_size; int base_offset = this->m_comm->get_rank_in_trainer(); +#ifdef LBANN_HAS_DISTCONV + base_offset = dc::get_input_rank(*(this->m_comm)) / dc::get_number_of_io_partitions(); +#endif /// Set mini-batch size and stride data_reader->set_mini_batch_size(max_mini_batch_size); data_reader->set_stride_to_next_mini_batch(batch_stride); +#ifdef LBANN_HAS_DISTCONV + data_reader->set_sample_stride(num_parallel_readers_per_model / dc::get_number_of_io_partitions()); +#else data_reader->set_sample_stride(num_parallel_readers_per_model); +#endif data_reader->set_iteration_stride(1); /// Set data reader base offset and model offset data_reader->set_base_offset(base_offset); @@ -116,7 +131,6 @@ void data_coordinator::calculate_num_iterations_per_epoch(int max_mini_batch_siz data_reader->set_num_iterations_per_epoch(num_iterations_per_epoch); data_reader->set_last_mini_batch_size(last_mini_batch_size); data_reader->set_stride_to_last_mini_batch(data_reader->get_stride_to_next_mini_batch()); - data_reader->set_global_mini_batch_size(max_mini_batch_size); data_reader->set_global_last_mini_batch_size(last_mini_batch_size); return; diff --git a/src/data_coordinator/data_coordinator_metadata.cpp b/src/data_coordinator/data_coordinator_metadata.cpp index 0189e8f8ae1..45f699a2ce6 100644 --- a/src/data_coordinator/data_coordinator_metadata.cpp +++ b/src/data_coordinator/data_coordinator_metadata.cpp @@ -37,6 +37,8 @@ std::string to_string(const data_reader_target_mode m) { return "regression"; case data_reader_target_mode::RECONSTRUCTION: return "reconstruction"; + case data_reader_target_mode::LABEL_RECONSTRUCTION: + return "label_reconstruction"; case data_reader_target_mode::INPUT: return "input"; case data_reader_target_mode::NA: diff --git a/src/data_readers/CMakeLists.txt b/src/data_readers/CMakeLists.txt index 780d74e0b1a..2b27750695e 100644 --- a/src/data_readers/CMakeLists.txt +++ b/src/data_readers/CMakeLists.txt @@ -21,5 +21,10 @@ set_full_path(THIS_DIR_SOURCES data_reader_smiles.cpp ) +if (LBANN_HAS_DISTCONV) + list(APPEND THIS_DIR_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/data_reader_hdf5.cpp") +endif () + # Propagate the files up the tree set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index a3669ff8572..4530044373f 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -623,17 +623,20 @@ std::string generic_data_reader::get_local_file_dir() const { return m_local_file_dir; } -void generic_data_reader::set_data_index_list(std::string s) { - m_data_index_list = s; +void generic_data_reader::set_data_sample_list(std::string s) { + m_data_sample_list = s; } -std::string generic_data_reader::get_data_index_list() const { - if (m_data_index_list == "") { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: you apparently did not call set_data_index_list; error!"); - } - return m_data_index_list; +std::string generic_data_reader::get_data_sample_list() const { + return m_data_sample_list; +} + +void generic_data_reader::keep_sample_order(bool same_order) { + // The sample_list::keep_sample_order() should be called using this + // flag. 
By doing so, it will add additional step to re-shuffle the + // sample order to restore it to the original before the loading + // with interleaving accesses by multiple ranks in a trainer. + m_keep_sample_order = same_order; } void generic_data_reader::set_data_filename(std::string s) { @@ -866,7 +869,7 @@ void generic_data_reader::print_get_methods(const std::string filename) { out << "get_file_dir " << get_file_dir() << std::endl; out << "get_local_file_dir " << get_local_file_dir() << std::endl; - out << "get_data_index_list " << get_data_index_list() << std::endl; + out << "get_data_sample_list " << get_data_sample_list() << std::endl; out << "get_data_filename " << get_data_filename() << std::endl; out << "get_label_filename " << get_label_filename() << std::endl; out << "get_role " << get_role() << std::endl; diff --git a/src/data_readers/data_reader_hdf5.cpp b/src/data_readers/data_reader_hdf5.cpp new file mode 100644 index 00000000000..b7a180602af --- /dev/null +++ b/src/data_readers/data_reader_hdf5.cpp @@ -0,0 +1,434 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////// +#include "lbann/data_readers/data_reader_hdf5.hpp" +#include "lbann/utils/profiling.hpp" +#include "lbann/utils/distconv.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" + +#include +#include +#include +#include +#include +#include + +namespace { +inline hid_t check_hdf5(hid_t hid, const char *file, int line) { + if (hid < 0) { + std::cerr << "HDF5 error" << std::endl; + std::cerr << "Error at " << file << ":" << line << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + return hid; +} +} // namespace + +#define CHECK_HDF5(call) check_hdf5(call, __FILE__, __LINE__) + +namespace lbann { + +template +hdf5_reader::hdf5_reader(const bool shuffle, + const std::string key_data, + const std::string key_labels, + const std::string key_responses, + const bool hyperslab_labels) + : generic_data_reader(shuffle), + m_use_data_store(options::get()->get_bool("use_data_store")), + m_key_data(key_data), + m_key_labels(key_labels), + m_key_responses(key_responses), + m_hyperslab_labels(hyperslab_labels) { +} + +template +hdf5_reader::hdf5_reader(const hdf5_reader& rhs) : generic_data_reader(rhs) { + copy_members(rhs); +} + +template +hdf5_reader& hdf5_reader::operator=(const hdf5_reader& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + generic_data_reader::operator=(rhs); + copy_members(rhs); + return (*this); +} + +template +void hdf5_reader::copy_members(const hdf5_reader &rhs) { + if(rhs.m_data_store != nullptr) { + m_data_store = new data_store_conduit(rhs.get_data_store()); + } + m_data_store->set_data_reader_ptr(this); + + m_has_labels = rhs.m_has_labels; + m_has_responses = rhs.m_has_responses; + m_num_features = rhs.m_num_features; + m_data_dims = rhs.m_data_dims; + m_hyperslab_dims = rhs.m_hyperslab_dims; + m_comm = rhs.m_comm; + m_file_paths = rhs.m_file_paths; + m_use_data_store = rhs.m_use_data_store; + m_key_data = rhs.m_key_data; + m_key_labels = rhs.m_key_labels; + m_key_responses = rhs.m_key_responses; + m_hyperslab_labels = rhs.m_hyperslab_labels; + m_all_responses = rhs.m_all_responses; +} + +template +void hdf5_reader::read_hdf5_hyperslab(hsize_t h_data, hsize_t filespace, + int rank, TensorDataType *sample) { + prof_region_begin("read_hdf5_hyperslab", prof_colors[0], false); + // this is the splits, right now it is hard coded to split along the + // z axis + int num_io_parts = dc::get_number_of_io_partitions(); + + // how many times the pattern should repeat in the hyperslab + hsize_t count[4] = {1,1,1,1}; + + // necessary for the hdf5 lib + hid_t memspace = H5Screate_simple(4, m_hyperslab_dims.data(), NULL); + int spatial_offset = rank % num_io_parts; + hsize_t offset[4] = {0, m_hyperslab_dims[1] * spatial_offset, 0, 0}; + + // from an explanation of the hdf5 select_hyperslab: + // start -> a starting location for the hyperslab + // stride -> the number of elements to separate each element or block to be selected + // count -> the number of elemenets or blocks to select along each dimension + // block -> the size of the block selected from the dataspace + //hsize_t status; + + CHECK_HDF5(H5Sselect_hyperslab(filespace, H5S_SELECT_SET, + offset, NULL, count, + m_hyperslab_dims.data())); + + CHECK_HDF5(H5Dread(h_data, get_hdf5_data_type(), memspace, + filespace, m_dxpl, sample)); + prof_region_end("read_hdf5_hyperslab", false); +} + +template +void hdf5_reader::read_hdf5_sample(int data_id, TensorDataType *sample, + TensorDataType 
*labels) { + int world_rank = get_comm()->get_rank_in_trainer(); + auto file = m_file_paths[data_id]; + hid_t h_file = CHECK_HDF5(H5Fopen(file.c_str(), H5F_ACC_RDONLY, m_fapl)); + + // load in dataset + hid_t h_data = CHECK_HDF5( + H5Dopen(h_file, m_key_data.c_str(), H5P_DEFAULT)); + hid_t filespace = CHECK_HDF5(H5Dget_space(h_data)); + //get the number of dimesnionse from the dataset + int rank1 = H5Sget_simple_extent_ndims(filespace); + hsize_t dims[rank1]; + // read in what the dimensions are + CHECK_HDF5(H5Sget_simple_extent_dims(filespace, dims, NULL)); + + read_hdf5_hyperslab(h_data, filespace, world_rank, sample); + //close data set + CHECK_HDF5(H5Dclose(h_data)); + + if (m_has_labels && labels != nullptr) { + assert_always(m_hyperslab_labels); + hid_t h_labels = CHECK_HDF5(H5Dopen(h_file, m_key_labels.c_str(), H5P_DEFAULT)); + hid_t filespace_labels = CHECK_HDF5(H5Dget_space(h_labels)); + read_hdf5_hyperslab(h_labels, filespace_labels, world_rank, labels); + CHECK_HDF5(H5Dclose(h_labels)); + } else if (m_has_responses) { + assert_always(labels == nullptr); + h_data = CHECK_HDF5(H5Dopen(h_file, m_key_responses.c_str(), H5P_DEFAULT)); + CHECK_HDF5(H5Dread(h_data, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &m_all_responses[0])); + CHECK_HDF5(H5Dclose(h_data)); + } + CHECK_HDF5(H5Fclose(h_file)); + return; +} + +template +void hdf5_reader::load() { + lbann_comm* l_comm = get_comm(); + MPI_Comm mpi_comm = l_comm->get_trainer_comm().GetMPIComm(); + int world_rank = l_comm->get_rank_in_trainer(); + int color = world_rank / dc::get_number_of_io_partitions(); + MPI_Comm_split(mpi_comm, color, world_rank, &m_comm); + m_shuffled_indices.clear(); + m_shuffled_indices.resize(m_file_paths.size()); + std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if ((nprocs % dc::get_number_of_io_partitions()) !=0) { + LBANN_ERROR("nprocs should be divisible by num of io partitions otherwise this wont work"); + } + + // Read the dimension size of the first sample, + // assuming that all of the samples have the same dimension size + if (m_file_paths.size() > 0) { + const hid_t h_file = CHECK_HDF5(H5Fopen(m_file_paths[0].c_str(), + H5F_ACC_RDONLY, H5P_DEFAULT)); + const hid_t h_data = CHECK_HDF5(H5Dopen(h_file, m_key_data.c_str(), + H5P_DEFAULT)); + const hid_t h_space = CHECK_HDF5(H5Dget_space(h_data)); + if (CHECK_HDF5(H5Sget_simple_extent_ndims(h_space)) != 4) { + LBANN_ERROR("The number of dimensions of HDF5 data samples should be 4"); + } + hsize_t dims[4]; + CHECK_HDF5(H5Sget_simple_extent_dims(h_space, dims, NULL)); + CHECK_HDF5(H5Dclose(h_data)); + m_data_dims = std::vector(dims, dims+4); + } else { + LBANN_ERROR("The number of HDF5 samples should not be zero"); + } + + m_num_features = std::accumulate(m_data_dims.begin(), + m_data_dims.end(), + (size_t) 1, + std::multiplies()); + + for (auto i: m_data_dims) { + m_hyperslab_dims.push_back(i); + } + // Partition the z dimension + m_hyperslab_dims[1] /= dc::get_number_of_io_partitions(); + +#define DATA_READER_HDF5_USE_MPI_IO +#ifdef DATA_READER_HDF5_USE_MPI_IO + m_fapl = CHECK_HDF5(H5Pcreate(H5P_FILE_ACCESS)); + CHECK_HDF5(H5Pset_fapl_mpio(m_fapl, m_comm, MPI_INFO_NULL)); + m_dxpl = CHECK_HDF5(H5Pcreate(H5P_DATASET_XFER)); + CHECK_HDF5(H5Pset_dxpl_mpio(m_dxpl, H5FD_MPIO_INDEPENDENT)); // H5FD_MPIO_COLLECTIVE +#else + m_fapl = H5P_DEFAULT; + m_dxpl = H5P_DEFAULT; +#endif + std::vector local_list_sizes; + options *opts = options::get(); + if 
(opts->get_bool("preload_data_store")) { + LBANN_ERROR("preload_data_store not supported on HDF5 data reader"); + } + if (m_use_data_store) { + instantiate_data_store(); + } + + select_subset_of_data(); + MPI_Comm_dup(dc::get_mpi_comm(), &m_response_gather_comm); +} + +template +bool hdf5_reader::fetch_label(Mat& Y, int data_id, int mb_idx) { + if(!m_has_labels) { + return generic_data_reader::fetch_label(Y, data_id, mb_idx); + } + + prof_region_begin("fetch_label", prof_colors[0], false); + assert_always(m_hyperslab_labels); + assert_always(m_use_data_store); + TensorDataType *buf = nullptr; + assert_eq(Y.Height(), m_num_features); + conduit::Node node; + const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); + node.set_external(ds_node); + const std::string conduit_obj = LBANN_DATA_ID_STR(data_id); + buf = node[conduit_obj+"/labels_slab"].value(); + std::memcpy(Y.Buffer(), buf, m_num_features/dc::get_number_of_io_partitions()*sizeof(TensorDataType)); + prof_region_end("fetch_label", false); + return true; +} + +template +bool hdf5_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { + prof_region_begin("fetch_datum", prof_colors[0], false); + + // In the Cosmoflow case, each minibatch should have only one + // sample per rank. + assert_eq(X.Width(), 1); + assert_eq(sizeof(DataType)%sizeof(TensorDataType), 0); + assert_eq(X.Height(), + m_num_features / dc::get_number_of_io_partitions() + / (sizeof(DataType) / sizeof(TensorDataType))); + + if (m_use_data_store) { + fetch_datum_conduit(X, data_id); + } else { + read_hdf5_sample(data_id, (TensorDataType*)X.Buffer(), nullptr); + } + prof_region_end("fetch_datum", false); + return true; +} + +template +void hdf5_reader::fetch_datum_conduit(Mat& X, int data_id) { + const std::string conduit_key = LBANN_DATA_ID_STR(data_id); + // Create a node to hold all of the data + conduit::Node node; + if (data_store_active()) { + prof_region_begin("get_conduit_node", prof_colors[0], false); + const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); + node.set_external(ds_node); + prof_region_end("get_conduit_node", false); + } else { + auto &conduit_obj = node[conduit_key + "/slab"]; + conduit_obj.set(get_conduit_data_type( + m_num_features / dc::get_number_of_io_partitions())); + TensorDataType *sample_buf = conduit_obj.value(); + if(m_has_labels) { + assert_always(m_hyperslab_labels); + auto &conduit_labels_obj = node[conduit_key + "/labels_slab"]; + conduit_labels_obj.set(get_conduit_data_type( + m_num_features / dc::get_number_of_io_partitions())); + TensorDataType *labels_buf = conduit_labels_obj.value(); + read_hdf5_sample(data_id, sample_buf, labels_buf); + } else { + read_hdf5_sample(data_id, sample_buf, nullptr); + } + if(m_has_responses) { + node[conduit_key + "/responses"].set( + &m_all_responses[0], + m_all_responses.size()); + } + if (priming_data_store()) { + // Once the node has been populated save it in the data store + m_data_store->set_conduit_node(data_id, node); + } + } + prof_region_begin("set_external", prof_colors[0], false); + conduit::Node slab; + slab.set_external(node[conduit_key + "/slab"]); + prof_region_end("set_external", false); + TensorDataType *data = slab.value(); + prof_region_begin("copy_to_buffer", prof_colors[0], false); + std::memcpy(X.Buffer(), data, slab.dtype().number_of_elements()*slab.dtype().element_bytes()); + prof_region_end("copy_to_buffer", false); +} + +//get from a cached response +template +bool hdf5_reader::fetch_response(Mat& Y, int data_id, int mb_idx) { + 
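// Responses, if present, were cached in m_all_responses (or in the data
+  // store) when the sample was read; copy them into Y and, when ranks are
+  // not reordered, gather the scattered values across ranks.
+ 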
if(!m_has_responses) { + return generic_data_reader::fetch_response(Y, data_id, mb_idx); + } + + prof_region_begin("fetch_response", prof_colors[0], false); + float *buf = nullptr; + if(m_hyperslab_labels) { + assert_eq(Y.Height(), m_num_features); + const std::string conduit_key = LBANN_DATA_ID_STR(data_id); + conduit::Node node; + const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); + node.set_external(ds_node); + conduit::Node slab; + slab.set_external(node[conduit_key + "/responses_slab"]); + prof_region_end("set_external", false); + buf = slab.value(); + std::memcpy(Y.Buffer(), buf, m_num_features*sizeof(TensorDataType)); + } else { + assert_eq(Y.Height(), m_all_responses.size()); + if (data_store_active()) { + conduit::Node node; + const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); + node.set_external(ds_node); + const std::string conduit_obj = LBANN_DATA_ID_STR(data_id); + buf = node[conduit_obj+"/responses"].value(); + }else { + buf = &m_all_responses[0]; + } + std::memcpy(Y.Buffer(), buf, + m_all_responses.size()*sizeof(DataType)); + if (dc::get_rank_stride() == 1) { + gather_responses(Y.Buffer()); + } + } + prof_region_end("fetch_response", false); + return true; +} + +// Gather scattered responses to the first N ranks, where N is the +// mini-batch size. This is not necessary when the rank reordering +// is used. +template +void hdf5_reader::gather_responses(float *responses) { + float recv_buf[m_all_responses.size()]; + const int rank = dc::get_mpi_rank(); + const int num_part = dc::get_number_of_io_partitions(); + const int mini_batch_size = this->get_loaded_mini_batch_size(); + const int src_rank = rank * num_part; + const int dst_rank = rank / num_part; + const int tag = 0; + int req_idx = 0; + MPI_Request req[2]; + + // send + if (rank % num_part == 0) { + MPI_Isend(responses, m_all_responses.size(), MPI_FLOAT, dst_rank, + tag, m_response_gather_comm, &req[req_idx]); + ++req_idx; + } + + // recv + if (rank < mini_batch_size) { + MPI_Irecv(recv_buf, m_all_responses.size(), MPI_FLOAT, src_rank, tag, + m_response_gather_comm, &req[req_idx]); + ++req_idx; + } + + if (req_idx > 0) { + MPI_Waitall(req_idx, req, MPI_STATUS_IGNORE); + } + + std::memcpy(responses, recv_buf, sizeof(float) * m_all_responses.size()); +} + +template<> hid_t hdf5_reader::get_hdf5_data_type() const { + return H5T_NATIVE_FLOAT; +} +template<> hid_t hdf5_reader::get_hdf5_data_type() const { + return H5T_NATIVE_DOUBLE; +} +template<> hid_t hdf5_reader::get_hdf5_data_type() const { + return H5T_NATIVE_SHORT; +} + +template<> conduit::DataType hdf5_reader::get_conduit_data_type(conduit::index_t num_elements) const { + return conduit::DataType::float32(num_elements); +} +template<> conduit::DataType hdf5_reader::get_conduit_data_type(conduit::index_t num_elements) const { + return conduit::DataType::float64(num_elements); +} +template<> conduit::DataType hdf5_reader::get_conduit_data_type(conduit::index_t num_elements) const { + return conduit::DataType::int16(num_elements); +} + +// TODO (oyamay): Instantiate hdf5_reader for large samples +#define PROTO(T) template class hdf5_reader; + +#include "lbann/macros/instantiate.hpp" + +} // namespace lbann diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 593a7413823..a11d2d078c2 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -49,19 +49,26 @@ image_data_reader::image_data_reader(const image_data_reader& rhs) } 
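 // Note: throughout this reader the flat m_image_list of (path, label)
 // pairs is replaced by an m_sample_list plus a parallel m_labels vector
 // indexed by sample id.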
image_data_reader& image_data_reader::operator=(const image_data_reader& rhs) { + if (this == &rhs) { + return (*this); + } generic_data_reader::operator=(rhs); m_image_dir = rhs.m_image_dir; - m_image_list = rhs.m_image_list; + m_labels = rhs.m_labels; m_image_width = rhs.m_image_width; m_image_height = rhs.m_image_height; m_image_num_channels = rhs.m_image_num_channels; m_image_linearized_size = rhs.m_image_linearized_size; m_num_labels = rhs.m_num_labels; + m_sample_list.copy(rhs.m_sample_list); return (*this); } void image_data_reader::copy_members(const image_data_reader &rhs) { + if (this == &rhs) { + return; + } if(rhs.m_data_store != nullptr) { m_data_store = new data_store_conduit(rhs.get_data_store()); @@ -69,12 +76,13 @@ void image_data_reader::copy_members(const image_data_reader &rhs) { } m_image_dir = rhs.m_image_dir; - m_image_list = rhs.m_image_list; + m_labels = rhs.m_labels; m_image_width = rhs.m_image_width; m_image_height = rhs.m_image_height; m_image_num_channels = rhs.m_image_num_channels; m_image_linearized_size = rhs.m_image_linearized_size; m_num_labels = rhs.m_num_labels; + m_sample_list.copy(rhs.m_sample_list); //m_thread_cv_buffer = rhs.m_thread_cv_buffer } @@ -118,7 +126,10 @@ void image_data_reader::set_input_params(const int width, const int height, cons } bool image_data_reader::fetch_label(CPUMat& Y, int data_id, int mb_idx) { - const label_t label = m_image_list[data_id].second; + if (static_cast(data_id) >= m_labels.size()) { + LBANN_ERROR("Cannot find label for sample " + std::to_string(data_id) + "."); + } + const label_t label = m_labels[data_id]; if (label < label_t{0} || label >= static_cast(m_num_labels)) { LBANN_ERROR( "\"",this->get_type(),"\" data reader ", @@ -129,32 +140,55 @@ bool image_data_reader::fetch_label(CPUMat& Y, int data_id, int mb_idx) { return true; } +void image_data_reader::dump_sample_label_list(const std::string& dump_file_name) { + std::ofstream os(dump_file_name); + const auto num_samples = m_sample_list.size(); + for (size_t i = 0ul; i < num_samples; ++i) { + const auto file_id = m_sample_list[i].first; + const std::string filename = m_sample_list.get_samples_filename(file_id); + os << filename << ' ' << std::to_string(m_labels[i]) << std::endl; + } +} + void image_data_reader::load() { options *opts = options::get(); - const std::string imageListFile = get_data_filename(); + // Load sample list + const std::string sample_list_file = get_data_sample_list(); - // load image list - m_image_list.clear(); - FILE *fplist = fopen(imageListFile.c_str(), "rt"); - if (!fplist) { - LBANN_ERROR("failed to open: " + imageListFile + " for reading"); - } - while (!feof(fplist)) { - char imagepath[512]; - label_t imagelabel; - if (fscanf(fplist, "%s%d", imagepath, &imagelabel) <= 1) { - break; + if (sample_list_file.empty()) { + gen_list_of_samples(); + } else { + load_list_of_samples(sample_list_file); + } + + if (opts->has_string("write_sample_list") && m_comm->am_trainer_master()) { + const std::string slist_name = (m_sample_list.get_header()).get_sample_list_name(); + std::stringstream s; + std::string basename = get_basename_without_ext(slist_name); + std::string ext = get_ext_name(slist_name); + s << basename << "." 
<< ext; + { + const std::string msg = " writing sample list '" + slist_name + + "' as '" + s.str() + "'"; + LBANN_WARNING(msg); + } + m_sample_list.write(s.str()); + } + if (opts->has_string("write_sample_label_list") && m_comm->am_trainer_master()) { + if (!(m_keep_sample_order || opts->has_string("keep_sample_order"))) { + std::cout << "Writting sample label list without the option " + << "`keep_sample_order' set." << std::endl; } - m_image_list.emplace_back(imagepath, imagelabel); + std::string dump_file = "image_list.trainer" + + std::to_string(m_comm->get_trainer_rank()) + + "." + this->get_role() + ".txt"; + dump_sample_label_list(dump_file); } - fclose(fplist); - // TODO: this will probably need to change after sample_list class - // is modified // reset indices m_shuffled_indices.clear(); - m_shuffled_indices.resize(m_image_list.size()); + m_shuffled_indices.resize(m_sample_list.size()); std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); resize_shuffled_indices(); @@ -164,6 +198,15 @@ void image_data_reader::load() { select_subset_of_data(); } +image_data_reader::sample_t image_data_reader::get_sample(const size_t idx) const { + if (idx >= m_labels.size()) { + LBANN_ERROR("Cannot find label for sample " + std::to_string(idx) + "."); + } + const auto sample_name = m_sample_list[idx].second; + const auto label = m_labels[idx]; + return sample_t(sample_name, label); +} + void read_raw_data(const std::string &filename, std::vector &data) { data.clear(); std::ifstream in(filename.c_str()); @@ -215,7 +258,6 @@ void image_data_reader::do_preload_data_store() { load_conduit_nodes_from_file(data_ids[io_thread_pool->get_local_thread_id()]); io_thread_pool->finish_work_group(); } - else { if (is_master()) { std::cout << "mode: NOT data_store_thread\n"; @@ -240,12 +282,6 @@ void image_data_reader::setup(int num_io_threads, observer_ptr io_t static_cast(m_image_width)}); } -std::vector image_data_reader::get_image_list_of_current_mb() const { - std::vector ret; - ret.reserve(m_mini_batch_size); - return ret; -} - bool image_data_reader::load_conduit_nodes_from_file(const std::unordered_set &data_ids) { for (auto data_id : data_ids) { conduit::Node &node = m_data_store->get_empty_node(data_id); @@ -257,8 +293,15 @@ bool image_data_reader::load_conduit_nodes_from_file(const std::unordered_set(data_id) >= m_labels.size()) { + LBANN_ERROR("Cannot find label for sample " + std::to_string(data_id) + "."); + } + const label_t label = m_labels[data_id]; + std::vector data; read_raw_data(filename, data); node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label); @@ -266,5 +309,291 @@ void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node & node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size(); } +/// Allow streams to be constructed on an existing data buffer without copying +template > +class vectorwrapbuf : public std::basic_streambuf { +public: + vectorwrapbuf(std::vector &vec) { + this->setg(vec.data(), vec.data(), vec.data() + vec.size()); + } +}; + +/** + * Load a sample list and then load labels from a separate file using `load_labels()` + * With the command line option `--load_full_sample_list_once`, the trainer master + * first loads the entire sample list file into a memory buffer, and broadcasts it + * to the other workers within the trainer. Then, the sample list is populated + * using the buffer content. Otherwise, the sample list is directly read from the + * file. 
The prototext variable `data_filedir` when specified overrides the base + * location of data files, written in the header of the sample list file. + * The option `keep_sample_order` from the command line or data reader prototexts, + * makes sure the order of samples in the list remains the same even with loading + * in an interleaving order by multiple trainer workers. + */ +void image_data_reader::load_list_of_samples(const std::string sample_list_file) { + // load the sample list + double tm1 = get_time(); + + options *opts = options::get(); + + if (m_keep_sample_order || opts->has_string("keep_sample_order")) { + m_sample_list.keep_sample_order(true); + } else { + m_sample_list.keep_sample_order(false); + } + + if (opts->get_bool("check_data")) { + m_sample_list.set_data_file_check(); + } + + std::vector buffer; + + if (opts->has_string("load_full_sample_list_once")) { + if (m_comm->am_trainer_master()) { + load_file(sample_list_file, buffer); + } + m_comm->trainer_broadcast(m_comm->get_trainer_master(), buffer); + + vectorwrapbuf strmbuf(buffer); + std::istream iss(&strmbuf); + + m_sample_list.set_sample_list_name(sample_list_file); + m_sample_list.load(iss, *m_comm, true); + } else { + m_sample_list.load(sample_list_file, *m_comm, true); + } + + double tm2 = get_time(); + + if (is_master()) { + std::cout << "Time to load sample list '" << sample_list_file << "': " + << tm2 - tm1 << std::endl; + } + + /// Merge all the sample list pieces from the workers within the trainer + m_sample_list.all_gather_packed_lists(*m_comm); + set_file_dir(m_sample_list.get_samples_dirname()); + + double tm3 = get_time(); + if(is_master()) { + std::cout << "Time to gather sample list '" << sample_list_file << "': " + << tm3 - tm2 << std::endl; + } + buffer.clear(); + buffer.shrink_to_fit(); + + std::vector empty_buffer; + load_labels(empty_buffer); +} + +void image_data_reader::load_list_of_samples_from_archive(const std::string& sample_list_archive) { + // load the sample list + double tm1 = get_time(); + std::stringstream ss(sample_list_archive); // any stream can be used + + cereal::BinaryInputArchive iarchive(ss); // Create an input archive + + iarchive(m_sample_list); // Read the data from the archive + double tm2 = get_time(); + + if (is_master()) { + std::cout << "Time to load sample list from archive: " << tm2 - tm1 << std::endl; + } +} + +/** + * Similar to `load_list_of_samples()` but generates the sample list header + * on-the-fly, and reuse the original imagenet data list file for loading both + * the sample list and the label list, of which path is specified via the + * prototext variable `data_filedir`. This is for the backward compatibility + * and allows users to use the old data reader prototext without preparing a + * sample list and modifying the prototext. The base location of data files + * is specified via `data_filedir` prototext variable as it was. 
+ */ +void image_data_reader::gen_list_of_samples() { + // load the sample list + double tm1 = get_time(); + + // The original imagenet data file specified via the prototext variable + // `data_filename` + const std::string imageListFile = get_data_filename(); + + sample_list_header header; // A sample list header being generated + header.set_sample_list_type(lbann::single_sample); + header.set_data_file_dir(get_file_dir()); + header.set_label_filename(imageListFile); + const std::string sample_list_file = imageListFile; + header.set_sample_list_name(sample_list_file); + + options *opts = options::get(); + + if (m_keep_sample_order || opts->has_string("keep_sample_order")) { + m_sample_list.keep_sample_order(true); + } else { + m_sample_list.keep_sample_order(false); + } + + if (opts->get_bool("check_data")) { + m_sample_list.set_data_file_check(); + } + + std::vector buffer; + + if (opts->has_string("load_full_sample_list_once")) { + // The trainer master loads the entire file into a buffer in the memory + if (m_comm->am_trainer_master()) { + load_file(imageListFile, buffer); + } + // Broadcast the buffer to workers within this trainer + m_comm->trainer_broadcast(m_comm->get_trainer_master(), buffer); + + // The trainer master counts the number of samples (lines) and broadcasts + // the result + size_t num_samples = 0ul; + if (m_comm->am_trainer_master()) { + vectorwrapbuf strmbuf(buffer); + std::istream iss(&strmbuf); + num_samples = determine_num_of_samples(iss); + } + m_comm->trainer_broadcast(m_comm->get_trainer_master(), num_samples); + header.set_sample_count(std::to_string(num_samples)); + + // Populate the sample list using the generated header and the preloaded buffer + vectorwrapbuf strmbuf(buffer); + std::istream iss(&strmbuf); + m_sample_list.load(header, iss, *m_comm, true); + } else { + // The trainer master counts the number of samples (lines) and broadcasts + // the result + size_t num_samples = 0ul; + if (m_comm->am_trainer_master()) { + std::ifstream iss(imageListFile); + num_samples = determine_num_of_samples(iss); + } + m_comm->trainer_broadcast(m_comm->get_trainer_master(), num_samples); + header.set_sample_count(std::to_string(num_samples)); + + // Populate the sample list using the generated header and the original + // imagenet data list file + std::ifstream iss(imageListFile); + m_sample_list.load(header, iss, *m_comm, true); + } + + double tm2 = get_time(); + + if (is_master()) { + std::cout << "Time to load sample list '" << sample_list_file << "': " + << tm2 - tm1 << std::endl; + } + + /// Merge all the sample list pieces from the workers within the trainer + m_sample_list.all_gather_packed_lists(*m_comm); + + double tm3 = get_time(); + if(is_master()) { + std::cout << "Time to gather sample list '" << sample_list_file << "': " + << tm3 - tm2 << std::endl; + } + // Reuse the preloaded buffer for obtaining labels when possible + load_labels(buffer); +} + +/// Populate the sample label vector out of the given input stream +void image_data_reader::read_labels(std::istream& istrm) { + const std::string whitespaces(" \t\f\v\n\r"); + const size_t num_samples = m_sample_list.size(); + + // To help populating the label list, build a map from a sample name to + // the index of the corresponding item in the sample list + m_sample_list.build_sample_map_from_name_to_index(); + + options *opts = options::get(); + const bool check_data = opts->get_bool("check_data"); + + m_labels.clear(); + m_labels.resize(num_samples); + std::unordered_set idx_set; + + std::string line; + + 
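// Parse lines of the form "<sample-name> <label>", mapping each name
+  // back to its index in the sample list before storing the label.
+ 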
while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + + // clear trailing spaces for accurate parsing + std::stringstream sstr(line.substr(0, end_of_str + 1)); + std::string sname; + label_t label; + + sstr >> sname >> label; + + // Translate the sample name into the index into the sample list + const auto sample_idx = m_sample_list.get_sample_index(sample_name_t(sname)); + if (sample_idx >= num_samples) { + continue; + } + if (check_data) { + idx_set.insert(sample_idx); + } + m_labels[sample_idx] = label; + } + + // Free the memory of the temporary map + m_sample_list.clear_sample_map_from_name_to_index(); + + if (check_data && (num_samples != idx_set.size())) { + LBANN_ERROR("The number of samples is different from the number of labels: ", + std::to_string(num_samples), + " != ", + std::to_string(idx_set.size())); + } +} + +/** + * Load the sample labels either from a file or from a preloaded buffer. + * If the buffer given is empty, the label file specified in the sample list + * header is used. + */ +void image_data_reader::load_labels(std::vector& preloaded_buffer) { + const std::string imageListFile = m_sample_list.get_label_filename(); + + double tm1 = get_time(); + + if (preloaded_buffer.empty()) { // read labels from a file + std::string line; + std::ifstream is; + is.open(imageListFile); + if (is.fail()) { + LBANN_ERROR("failed to open: " + imageListFile + " for reading"); + } + read_labels(is); + } else { // read labels from a preloaded buffer + vectorwrapbuf strmbuf(preloaded_buffer); + std::istream is(&strmbuf); + read_labels(is); + } + + if (is_master()) { + std::cout << "Time to load label file '" << imageListFile << "': " + << get_time() - tm1 << std::endl; + } +} + +size_t image_data_reader::determine_num_of_samples(std::istream& istrm) const { + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt = 0ul; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + cnt ++; + } + return cnt; +} } // namespace lbann diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index 0d83fc679ad..2089e4f9a22 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -54,7 +54,10 @@ CPUMat imagenet_reader::create_datum_view(CPUMat& X, const int mb_idx) const { bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { El::Matrix image; std::vector dims; - const std::string image_path = get_file_dir() + m_image_list[data_id].first; + const auto file_id = m_sample_list[data_id].first; + const std::string filename = m_sample_list.get_samples_filename(file_id); + const std::string image_path = get_file_dir() + filename; + if (m_data_store != nullptr) { bool have_node = true; conduit::Node node; diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 76eb78685c8..6266e537b0e 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -54,6 +54,7 @@ #include #include +#include // This comes after all the headers, and is only visible within the current implementation file. 
// To make sure, we put '#undef _CN_' at the end of this file @@ -786,58 +787,15 @@ void data_reader_jag_conduit::load() { if(is_master()) { std::cout << "data_reader_jag_conduit - starting load" << std::endl; } - const std::string data_dir = add_delimiter(get_file_dir()); - const std::string sample_list_file = data_dir + get_data_index_list(); + const std::string sample_list_file = get_data_sample_list(); - options *opts = options::get(); - bool check_data = opts->get_bool("check_data"); - - /// The use of these flags need to be updated to properly separate - /// how index lists are used between trainers and models - /// @todo m_list_per_trainer || m_list_per_model - double tm2 = get_time(); - load_list_of_samples(sample_list_file, m_comm->get_procs_per_trainer(), m_comm->get_rank_in_trainer()); - if(is_master()) { - std::cout << "Finished loading sample list; time: " << get_time() - tm2 << std::endl; - if (!check_data) { - std::cout << "Skipping check data" << std::endl; - } + if (sample_list_file.empty()) { + LBANN_ERROR("sample list is not specified."); } - /// Check the data that each rank loaded - if (!m_is_data_loaded && !m_sample_list.empty()) { - m_is_data_loaded = true; - - /// Open the first sample to make sure that all of the fields are correct - m_sample_list.open_samples_file_handle(0, true); + load_list_of_samples(sample_list_file); - if (m_scalar_keys.size() == 0u) { - set_all_scalar_choices(); // use all by default if none is specified - } - if (check_data) { - check_scalar_keys(); - } - - if (m_input_keys.size() == 0u) { - set_all_input_choices(); // use all by default if none is specified - } - if (check_data) { - check_input_keys(); - } - - if (check_data) { - check_image_data(); - } - - m_sample_list.close_if_done_samples_file_handle(0); - } - if(is_master()) { - std::cout << "Done with data checking" << std::endl; - } - - /// Merge all of the sample lists - tm2 = get_time(); - m_sample_list.all_gather_packed_lists(*m_comm); + options *opts = options::get(); if (opts->has_string("write_sample_list") && m_comm->am_trainer_master()) { { const std::string msg = " writing sample list " + sample_list_file; @@ -849,18 +807,12 @@ void data_reader_jag_conduit::load() { s << basename << "." 
<< ext; m_sample_list.write(s.str()); } - if (is_master()) { - std::cout << "time for all_gather_packed_lists: " << get_time() - tm2 << std::endl; - } + m_shuffled_indices.clear(); m_shuffled_indices.resize(m_sample_list.size()); std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); resize_shuffled_indices(); - if(is_master()) { - std::cout << "Lists have been gathered" << std::endl; - } - instantiate_data_store(); select_subset_of_data(); } @@ -926,14 +878,103 @@ void data_reader_jag_conduit::do_preload_data_store() { } } -void data_reader_jag_conduit::load_list_of_samples(const std::string sample_list_file, size_t stride, size_t offset) { +void data_reader_jag_conduit::sample_schema_check(const bool check_data) { + /// Check the data that each rank loaded + if (!m_is_data_loaded && !m_sample_list.empty()) { + m_is_data_loaded = true; + + /// Open the first sample to make sure that all of the fields are correct + m_sample_list.open_samples_file_handle(0, true); + + if (m_scalar_keys.size() == 0u) { + set_all_scalar_choices(); // use all by default if none is specified + } + if (check_data) { + check_scalar_keys(); + } + + if (m_input_keys.size() == 0u) { + set_all_input_choices(); // use all by default if none is specified + } + if (check_data) { + check_input_keys(); + } + + if (check_data) { + check_image_data(); + } + + m_sample_list.close_if_done_samples_file_handle(0); + } +} + +template > +class vectorwrapbuf : public std::basic_streambuf { +public: + vectorwrapbuf(std::vector &vec) { + this->setg(vec.data(), vec.data(), vec.data() + vec.size()); + } +}; + +void data_reader_jag_conduit::load_list_of_samples(const std::string sample_list_file) { // load the sample list double tm1 = get_time(); - m_sample_list.load(sample_list_file, stride, offset); + + options *opts = options::get(); + + if (this->m_keep_sample_order || opts->has_string("keep_sample_order")) { + m_sample_list.keep_sample_order(true); + } else { + m_sample_list.keep_sample_order(false); + } + + const bool check_data = opts->get_bool("check_data"); + + if (check_data) { + m_sample_list.set_data_file_check(); + } + + std::vector buffer; + + if (opts->has_string("load_full_sample_list_once")) { + if (m_comm->am_trainer_master()) { + load_file(sample_list_file, buffer); + } + m_comm->trainer_broadcast(m_comm->get_trainer_master(), buffer); + + vectorwrapbuf strmbuf(buffer); + std::istream iss(&strmbuf); + + m_sample_list.set_sample_list_name(sample_list_file); + m_sample_list.load(iss, *(this->m_comm), true); + } else { + m_sample_list.load(sample_list_file, *(this->m_comm), true); + } + double tm2 = get_time(); if (is_master()) { - std::cout << "Time to load sample list: " << tm2 - tm1 << std::endl; + std::cout << "Time to load sample list '" << sample_list_file << "': " << tm2 - tm1 << std::endl; + } + + sample_schema_check(check_data); + + double tm3 = get_time(); + if (is_master()) { + if (!check_data) { + std::cout << "Skip data checking" << std::endl; + } else { + std::cout << "Time to check sample data: " << tm3 - tm2 << std::endl; + } + } + + /// Merge all of the sample lists + m_sample_list.all_gather_packed_lists(*m_comm); + set_file_dir(m_sample_list.get_samples_dirname()); + + double tm4 = get_time(); + if(is_master()) { + std::cout << "Time to gather sample list '" << sample_list_file << "': " << tm4 - tm3 << std::endl; } } diff --git a/src/data_readers/data_reader_smiles.cpp b/src/data_readers/data_reader_smiles.cpp index 45ee74e1fcb..492c8ca5e54 100644 --- 
a/src/data_readers/data_reader_smiles.cpp +++ b/src/data_readers/data_reader_smiles.cpp @@ -134,11 +134,21 @@ void smiles_data_reader::load() { // Use two loops here, to ensure all trainers have // the same number of samples + // ensure the number of samples is evenly divisible by + // the number of trainers + size_t n = m_shuffled_indices.size() / num_trainers; + size_t s3 = n*num_trainers; + if (m_shuffled_indices.size() != s3) { + if (is_master()) { + std::cout << "adjusting global sample size from " << m_shuffled_indices.size() << " to " << s3 << std::endl; + } + m_shuffled_indices.resize(s3); + } for (size_t j=0; j sanity_max) sanity_max = id; - if (m_data_store->get_index_owner(id) != m_comm->get_rank_in_world()) { + if (m_data_store->get_index_owner(id) != m_comm->get_rank_in_trainer()) { continue; } valid_ids.insert(id); } int max_index = sanity_max; - // cheap sanity check + // cheap sanity check if ( (sanity_min != m_min_index || sanity_max != m_max_index) && get_role() == "train") { @@ -267,7 +277,7 @@ bool smiles_data_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { // no data_store: all data is stored locally if (m_data_store == nullptr) { get_sample(data_id, data); - data_ptr = data.data(); + data_ptr = data.data(); sz = data.size(); } @@ -287,10 +297,10 @@ bool smiles_data_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { data_ptr = data.data(); sz = data.size(); } - + size_t j; for (j = 0; j < sz; ++j) { - X(j, mb_idx) = data_ptr[j]; + X(j, mb_idx) = data_ptr[j]; } for (; j(m_linearized_data_size); j++) { X(j, mb_idx) = m_pad; @@ -323,11 +333,11 @@ void smiles_data_reader::print_statistics() const { std::cout << "max sequence length: " << utils::commify(m_linearized_data_size) << std::endl; std::cout << "num features=" << utils::commify(m_linearized_data_size) << std::endl; if (m_delimiter == '\t') { - std::cout << "delimiter: \n"; + std::cout << "delimiter: \n"; } else if (m_delimiter == ',') { - std::cout << "delimiter: \n"; + std::cout << "delimiter: \n"; } else if (m_delimiter == '\0') { - std::cout << "delimiter: \n"; + std::cout << "delimiter: \n"; } else { LBANN_ERROR("invalid delimiter character, as int: ", (int)m_delimiter); } @@ -356,7 +366,7 @@ void smiles_data_reader::load_vocab() { if (token.size() == 1) { m_vocab[token[0]] = id; m_vocab_inv[id] = token[0]; - } + } if (token == "") { m_pad = id; --sanity; @@ -401,7 +411,7 @@ int smiles_data_reader::get_num_lines(std::string fn) { } in.close(); - std::cout << "smiles_data_reader::get_num_lines; num_lines: " + std::cout << "smiles_data_reader::get_num_lines; num_lines: " << utils::commify(count) << " time: " << get_time()-tm1 << std::endl; } @@ -419,8 +429,8 @@ int smiles_data_reader::get_num_lines(std::string fn) { int n_lines = INT_MAX; if (opts->has_int("n_lines")) { n_lines = opts->get_int("n_lines"); - if(is_master() && count < n_lines) { - std::cout << "WARNING:: number of available samples (" << count + if(is_master() && count < n_lines) { + std::cout << "WARNING:: number of available samples (" << count << " ) in file " << fn << " is less than number of samples requested (" << n_lines << " ) I am returning number of available samples " << std::endl; } @@ -511,7 +521,7 @@ void smiles_data_reader::setup_local_cache() { double tm3 = get_time(); if (is_master()) { std::cout << "\nSTARTING smiles_data_reader::setup_fast_experimental() " << std::endl << std::endl; - } + } // This will hold: (datum_id, datum_offset, datum length) for each sample std::vector
sample_offsets(m_shuffled_indices.size()*3); @@ -531,15 +541,15 @@ void smiles_data_reader::setup_local_cache() { std::string line; if (m_has_header) { getline(in, line); - } + } // Part 1: compute memory requirements for local cache - // Get max sample id, which will be the number of lines we need to + // Get max sample id, which will be the number of lines we need to // read from file. This is needed if (1) not using 100% of data, // and/or (2) carving off part of train data to use as validation. std::unordered_set samples_to_use; - int max_sample_id = 0; + int max_sample_id = 0; for (size_t j=0; j max_sample_id ? m_shuffled_indices[j] : max_sample_id; @@ -568,8 +578,8 @@ void smiles_data_reader::setup_local_cache() { // Part 2: Fill in the data buffer in.seekg(0); if (m_has_header) { - getline(in, line); - } + getline(in, line); + } offset = 0; for (int j=0; jbroadcast(0, sample_offsets.data(), sample_offsets.size(), m_comm->get_world_comm()); for (size_t j=0; jget_rank_in_world() != 1) { return; } - + // option: testing the test ;) bool fail = options::get()->get_bool("make_test_fail"); @@ -676,7 +686,7 @@ void smiles_data_reader::test_encode() { ++num_tested; // encode then decode the datum that is stored in memory get_sample(sample_id, encoded); - decode_smiles(encoded, decoded); + decode_smiles(encoded, decoded); // get datum length from the line we've just read from file size_t k = get_smiles_string_length(line, sample_id); @@ -688,7 +698,7 @@ void smiles_data_reader::test_encode() { if (num_tested > 10 && fail) { for (size_t h=0; h>> TESTS PASSED <<< " << std::endl; } @@ -722,7 +732,7 @@ void smiles_data_reader::decode_smiles(const std::vector &data, std::stri for (auto tt : data) { s2 << tt << " "; } - s2 << "; m_vocab_inv.size(): " << m_vocab_inv.size() + s2 << "; m_vocab_inv.size(): " << m_vocab_inv.size() << " m_vocab_inv keys: "; for (auto tt : m_vocab_inv) { s2 << tt.first << " "; @@ -735,7 +745,7 @@ void smiles_data_reader::decode_smiles(const std::vector &data, std::stri s << ""; } else if (!(x == "" || x == "" || x == "")) { s << m_vocab_inv[t]; - } + } } out = s.str(); } @@ -766,7 +776,7 @@ void smiles_data_reader::get_delimiter() { break; default : LBANN_ERROR("Invalid delimiter character; should be 'c', 't', '0'; you passed: ", d); - } + } } if (is_master()) { std::cout << "USING delimiter character: (int)" << (int)m_delimiter << std::endl; diff --git a/src/data_readers/data_reader_synthetic.cpp b/src/data_readers/data_reader_synthetic.cpp index 7088b9b908b..012db31307a 100644 --- a/src/data_readers/data_reader_synthetic.cpp +++ b/src/data_readers/data_reader_synthetic.cpp @@ -74,6 +74,7 @@ bool data_reader_synthetic::fetch_label(CPUMat& Y, int data_id, int mb_idx) { if (m_num_labels == 0) { LBANN_ERROR("Synthetic data reader does not have labels"); } + auto io_rng = set_io_generators_local_index(0); Y.Set(fast_rand_int(get_fast_io_generator(), m_num_labels), mb_idx, 1); return true; } diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index f19413f1117..27295025bae 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -32,6 +32,7 @@ #include "lbann/utils/exception.hpp" #include "lbann/utils/options.hpp" #include "lbann/utils/timer.hpp" +#include "lbann/utils/distconv.hpp" #include "lbann/utils/file_utils.hpp" #include "lbann/utils/commify.hpp" #include @@ -61,11 +62,20 @@ data_store_conduit::data_store_conduit( LBANN_ERROR("m_comm is nullptr"); } +#ifdef LBANN_HAS_DISTCONV + int 
num_io_parts = dc::get_number_of_io_partitions(); +#else + int num_io_parts = 1; +#endif // LBANN_HAS_DISTCONV + m_world_master = m_comm->am_world_master(); m_trainer_master = m_comm->am_trainer_master(); m_rank_in_trainer = m_comm->get_rank_in_trainer(); m_rank_in_world = m_comm->get_rank_in_world(); + m_partition_in_trainer = m_rank_in_trainer/num_io_parts; // needs a better name which group you are in + m_offset_in_partition = m_rank_in_trainer%num_io_parts; m_np_in_trainer = m_comm->get_procs_per_trainer(); + m_num_partitions_in_trainer = m_np_in_trainer/num_io_parts; // rename this m_num_io_groups_in_trainer open_informational_files(); @@ -79,10 +89,10 @@ data_store_conduit::data_store_conduit( if (opts->has_string("data_store_test_checkpoint") && opts->has_string("data_store_spill")) { LBANN_ERROR("you passed both --data_store_test_checkpoint and --data_store_spill; please use one or the other or none, but not both"); - } + } if (opts->has_string("data_store_test_checkpoint")) { setup_checkpoint_test(); - } + } if (opts->has_string("data_store_spill")) { setup_spill(opts->get_string("data_store_spill")); } @@ -90,7 +100,7 @@ data_store_conduit::data_store_conduit( set_is_local_cache(opts->get_bool("data_store_cache")); set_is_preloading(opts->get_bool("preload_data_store")); set_is_explicitly_loading(! is_preloading()); - + if (is_local_cache()) { PROFILE("data_store_conduit is running in local_cache mode"); } else { @@ -128,7 +138,7 @@ void data_store_conduit::setup_checkpoint_test() { std::string c = options::get()->get_string("data_store_test_checkpoint"); if (c == "1") { LBANN_ERROR("--data_store_test_checkpoint=1; you probably forgot to specify the spill directory; you must specify --data_store_test_checkpoint='"); - } + } if (c == "lassen") { c = get_lassen_spill_dir(); } @@ -161,8 +171,8 @@ data_store_conduit& data_store_conduit::operator=(const data_store_conduit& rhs) return (*this); } -void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { - m_reader = reader; +void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { + m_reader = reader; m_debug = 0; m_profile = 0; open_informational_files(); @@ -183,7 +193,11 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs) { m_world_master = rhs.m_world_master; m_trainer_master = rhs.m_trainer_master; m_rank_in_trainer = rhs.m_rank_in_trainer; + m_rank_in_world = rhs.m_rank_in_world; + m_partition_in_trainer = rhs.m_partition_in_trainer; + m_offset_in_partition = rhs.m_offset_in_partition; m_np_in_trainer = rhs.m_np_in_trainer; + m_num_partitions_in_trainer = rhs.m_num_partitions_in_trainer; m_owner = rhs.m_owner; m_shuffled_indices = rhs.m_shuffled_indices; m_sample_sizes = rhs.m_sample_sizes; @@ -244,7 +258,7 @@ void data_store_conduit::setup_data_store_buffers() { void data_store_conduit::spill_preloaded_conduit_node(int data_id, const conduit::Node &node) { // note: at this point m_data[data_id] = node conduit::Node n3 = node; - { + { std::lock_guard lock(m_mutex); build_node_for_sending(node, n3); } @@ -277,7 +291,7 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, const conduit:: if (is_local_cache()) { std::lock_guard lock(m_mutex); ++m_my_num_indices; - m_data[data_id] = node; + m_data[data_id] = node; return; } @@ -287,7 +301,7 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, const conduit:: return; } - { + { conduit::Node n2 = node; // node == m_data[data_id] std::lock_guard lock(m_mutex); build_node_for_sending(n2, 
m_data[data_id]); @@ -380,7 +394,8 @@ void data_store_conduit::set_conduit_node(int data_id, const conduit::Node &node { // std::lock_guard lock(m_mutex); LBANN_ERROR("NOT YET IMPLEMENTED"); - m_owner[data_id] = m_rank_in_trainer; + auto key = std::make_pair(data_id, m_offset_in_partition); + m_owner[key] = m_rank_in_trainer; m_sample_sizes[data_id] = n2.total_bytes_compact(); spill_conduit_node(node, data_id); m_spilled_nodes[data_id] = m_cur_spill_dir_integer; @@ -388,14 +403,15 @@ void data_store_conduit::set_conduit_node(int data_id, const conduit::Node &node } else { - { - // std::lock_guard lock(m_mutex); - m_owner[data_id] = m_rank_in_trainer; - build_node_for_sending(node, m_data[data_id]); - m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); - } + // m_mutex.lock(); + DEBUG_DS("set_conduit_node : rank_in_trainer=", m_rank_in_trainer, " and partition_in_trainer=", m_partition_in_trainer, " offset in partition=", m_offset_in_partition, " with num_partitions=", m_num_partitions_in_trainer); + auto key = std::make_pair(data_id, m_offset_in_partition); + m_owner[key] = m_rank_in_trainer; + build_node_for_sending(node, m_data[data_id]); + m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); error_check_compacted_node(m_data[data_id], data_id); - } + // m_mutex.unlock(); + } } } @@ -578,6 +594,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s tm5 = get_time(); m_comm->wait_all(m_send_requests); m_comm->wait_all(m_recv_requests); + m_comm->trainer_barrier(); m_wait_all_time += (get_time() - tm5); //======================================================================== @@ -615,8 +632,14 @@ int data_store_conduit::build_indices_i_will_recv(int current_pos, int mb_size) int k = 0; for (int i=current_pos; i< current_pos + mb_size; ++i) { auto index = (*m_shuffled_indices)[i]; - if ((i % m_owner_map_mb_size) % m_np_in_trainer == m_rank_in_trainer) { - int owner = m_owner[index]; +#ifdef LBANN_HAS_DISTCONV + int num_ranks_in_partition = dc::get_number_of_io_partitions(); +#else + int num_ranks_in_partition = 1; +#endif // LBANN_HAS_DISTCONV + if ((((i % m_owner_map_mb_size) % m_num_partitions_in_trainer) * num_ranks_in_partition + m_offset_in_partition) == m_rank_in_trainer) { + auto key = std::make_pair(index, m_offset_in_partition); + int owner = m_owner[key]; m_indices_to_recv[owner].insert(index); k++; } @@ -639,11 +662,17 @@ int data_store_conduit::build_indices_i_will_send(int current_pos, int mb_size) is_mine = true; } if (is_mine) { - m_indices_to_send[(i % m_owner_map_mb_size) % m_np_in_trainer].insert(index); +#ifdef LBANN_HAS_DISTCONV + int num_ranks_in_partition = dc::get_number_of_io_partitions(); +#else + int num_ranks_in_partition = 1; +#endif // LBANN_HAS_DISTCONV + m_indices_to_send[(((i % m_owner_map_mb_size) % m_num_partitions_in_trainer) * num_ranks_in_partition + m_offset_in_partition)].insert(index); // Sanity check - if (m_owner[index] != m_rank_in_trainer) { - LBANN_ERROR( "error for i: ", i, " index: ", index, " m_owner: ", m_owner[index], " me: ", m_rank_in_trainer); + auto key = std::make_pair(index, m_offset_in_partition); + if (m_owner[key] != m_rank_in_trainer) { + LBANN_ERROR( "error for i: ", i, " index: ", index, " m_owner: ", m_owner[key], " me: ", m_rank_in_trainer); } k++; } @@ -662,7 +691,8 @@ void data_store_conduit::build_preloaded_owner_map(const std::vector& per_r ++owning_rank; per_rank_list_range_start += per_rank_list_size; } - m_owner[(*m_shuffled_indices)[i]] = owning_rank; + auto key = 
std::make_pair((*m_shuffled_indices)[i], m_offset_in_partition); + m_owner[key] = owning_rank; } PROFILE("build_preloaded_owner_map; m_owner_maps_were_exchanged = true"); m_owner_maps_were_exchanged = true; @@ -709,10 +739,11 @@ void data_store_conduit::compact_nodes() { } int data_store_conduit::get_index_owner(int idx) { - if (m_owner.find(idx) == m_owner.end()) { + auto key = std::make_pair(idx, m_offset_in_partition); + if (m_owner.find(key) == m_owner.end()) { LBANN_ERROR(" idx: ", idx, " was not found in the m_owner map; map size: ", m_owner.size()); } - return m_owner[idx]; + return m_owner[key]; } void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset) { @@ -941,7 +972,7 @@ void data_store_conduit::set_loading_is_complete() { } } -bool data_store_conduit::is_fully_loaded() const { +bool data_store_conduit::is_fully_loaded() const { if (m_loading_is_complete) { return true; } @@ -960,7 +991,7 @@ void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vector(m_reader) failed"); } - const std::vector &image_list = image_reader->get_image_list(); + const auto& sample_list = image_reader->get_sample_list(); std::vector my_image_sizes; // this block fires if we're exchanging cache data at the end @@ -972,15 +1003,17 @@ void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vectorsize(); h += m_np_in_trainer) { ++m_my_num_indices; - const std::string fn = m_reader->get_file_dir() + '/' + image_list[(*m_shuffled_indices)[h]].first; + const auto file_id = sample_list[(*m_shuffled_indices)[h]].first; + const std::string fn = m_reader->get_file_dir() + '/' + + sample_list.get_samples_filename(file_id); std::ifstream in(fn.c_str()); if (!in) { - LBANN_ERROR("failed to open ", fn, " for reading; file_dir: ", m_reader->get_file_dir(), " fn: ", image_list[h].first, "; role: ", m_reader->get_role()); + LBANN_ERROR("failed to open ", fn, " for reading ", fn, "; role: " + m_reader->get_role()); } in.seekg(0, std::ios::end); my_image_sizes.push_back((*m_shuffled_indices)[h]); @@ -1005,7 +1038,7 @@ void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vector work(image_list.size()*2); + std::vector work(sample_list.size()*2); m_comm->trainer_all_gather(my_image_sizes, work, counts, disp); indices.resize(m_np_in_trainer); for (int h=0; h> indices; @@ -1180,7 +1213,7 @@ void data_store_conduit::read_files(std::vector &work, map_is_t &sizes, st //get the list of images from the data reader image_data_reader *image_reader = dynamic_cast(m_reader); - const std::vector &image_list = image_reader->get_image_list(); + const auto& sample_list = image_reader->get_sample_list(); //read the images size_t offset = 0; @@ -1188,7 +1221,9 @@ void data_store_conduit::read_files(std::vector &work, map_is_t &sizes, st for (size_t j=0; jget_file_dir() + '/' + image_list[idx].first; + const auto file_id = sample_list[idx].first; + const std::string fn = m_reader->get_file_dir() + '/' + + sample_list.get_samples_filename(file_id); std::ifstream in(fn, std::ios::in | std::ios::binary); in.read(work.data()+offset, s); in.close(); @@ -1198,10 +1233,10 @@ void data_store_conduit::read_files(std::vector &work, map_is_t &sizes, st void data_store_conduit::build_conduit_nodes(map_is_t &sizes) { image_data_reader *image_reader = dynamic_cast(m_reader); - const std::vector &image_list = image_reader->get_image_list(); for (auto t : sizes) { int data_id = t.first; - int label = image_list[data_id].second; + const auto 
sample = image_reader->get_sample(static_cast(data_id)); + const auto label = sample.second; if (m_image_offsets.find(data_id) == m_image_offsets.end()) { LBANN_ERROR("m_image_offsets.find(data_id) == m_image_offsets.end() for data_id: ", data_id); } @@ -1323,31 +1358,52 @@ void data_store_conduit::exchange_owner_maps() { m_comm->all_gather(&my_count, 1, all_counts.data(), 1, m_comm->get_trainer_comm()); std::vector my_sizes(m_my_num_indices); + std::vector> nodes_i_own(m_owner.size()); size_t j = 0; for (auto t : m_owner) { - my_sizes[j++] = t.first; + auto slab_id = std::make_pair(t.first.first, t.first.second); + nodes_i_own[j++] = slab_id; + DEBUG_DS("I am building the size vector from the owner map for ", t.first.first, ".", t.first.second, " and ", t.second); } - std::vector others; + std::vector> other_ranks_nodes; for (int k=0; kbroadcast(k, my_sizes.data(), all_counts[k], m_comm->get_trainer_comm()); + m_comm->broadcast>(k, nodes_i_own.data(), all_counts[k], m_comm->get_trainer_comm()); + if(m_debug) { + int c = 0; + for(auto i : nodes_i_own) { + DEBUG_DS("k=", k, ": nodes_i_own[", c, "]=", i.first, ".", i.second); + c++; + } + } } else { - m_comm->broadcast(k, others.data(), all_counts[k], m_comm->get_trainer_comm()); - for (size_t i=0; ibroadcast>(k, other_ranks_nodes.data(), all_counts[k], m_comm->get_trainer_comm()); + if(m_debug) { + int c = 0; + for(auto i : other_ranks_nodes) { + DEBUG_DS("k=", k, ": other_ranks_nodes[", c, "]=", i.first, ".", i.second); + c++; + } + } + for (size_t i=0; iget_role(), "; m_owner[", others[i],"] = ", m_owner[others[i]], " for role: ", m_reader->get_role(), " m_owner.size: ", m_owner.size(), " m_data.size(): ", m_data.size()); + LBANN_ERROR("duplicate data_id: ", other_ranks_nodes[i].first, ".", + other_ranks_nodes[i].second, " role: ", m_reader->get_role(), "; m_owner[",other_ranks_nodes[i].first, ".", other_ranks_nodes[i].second,"] = ", m_owner[key]); } - m_owner[others[i]] = k; + m_owner[key] = k; } } @@ -1435,7 +1491,7 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ PROFILE(" is_fully_loaded: ", is_fully_loaded()); if (! 
is_local_cache()) { profile_timing(); - } + } } double tm1 = get_time(); @@ -1445,11 +1501,11 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ PROFILE("calling exchange_owner_maps"); if (!m_owner_maps_were_exchanged) { exchange_owner_maps(); - } + } - else { + else { PROFILE(" owner_maps were already exchanged; returning"); - } + } m_owner_maps_were_exchanged = true; PROFILE("exchange_mini_batch_data; m_owner_maps_were_exchanged = true"); /* @@ -1458,7 +1514,7 @@ PROFILE("exchange_mini_batch_data; m_owner_maps_were_exchanged = true"); m_is_spilled = true; m_metadata.close(); save_state(); - } + } */ } @@ -1517,7 +1573,7 @@ void data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) { } if (m_world_master) { - std::cout << "Cleared the owner map; m_owner.size(): " << m_owner.size() + std::cout << "Cleared the owner map; m_owner.size(): " << m_owner.size() << std::endl << "Calling load_checkpoint" << std::endl; } @@ -1532,9 +1588,9 @@ void data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) { //check that the owner map was correctly loaded for (auto t : m_owner) { if (sanity.find(t.first) == sanity.end()) { - LBANN_ERROR("sanity.find(t.first) == sanity.end() for t.first= ", t.first); + LBANN_ERROR("sanity.find(t.first) == sanity.end() for t.first= ", t.first.first, ":", t.first.second); } else if (sanity[t.first] != m_owner[t.first]) { - LBANN_ERROR("sanity[t.first] != m_owner[t.first] for t.first= ", t.first, " and m_owner[t.first]= ", m_owner[t.first]); + LBANN_ERROR("sanity[t.first] != m_owner[t.first] for t.first= ", t.first.first, ":", t.first.second, " and m_owner[t.first]= ", m_owner[t.first]); } } @@ -1571,7 +1627,7 @@ void data_store_conduit::setup_spill(std::string base_dir) { // open metadata file; this will contains the file pathnames of spilled // conduit nodes const std::string fnn = get_metadata_fn(); - m_metadata.open(fnn.c_str()); + m_metadata.open(fnn.c_str()); if (!m_metadata) { LBANN_ERROR("failed to open ", fnn, " for writing"); } @@ -1614,15 +1670,15 @@ void data_store_conduit::save_state() { { cereal::XMLOutputArchive archive(os); archive(CEREAL_NVP(m_my_num_indices), - CEREAL_NVP(m_owner_maps_were_exchanged), + CEREAL_NVP(m_owner_maps_were_exchanged), CEREAL_NVP(m_is_setup), - CEREAL_NVP(m_preloading), - CEREAL_NVP(m_loading_is_complete), + CEREAL_NVP(m_preloading), + CEREAL_NVP(m_loading_is_complete), CEREAL_NVP(m_explicitly_loading), - CEREAL_NVP(m_owner_map_mb_size), - CEREAL_NVP(m_compacted_sample_size), + CEREAL_NVP(m_owner_map_mb_size), + CEREAL_NVP(m_compacted_sample_size), CEREAL_NVP(m_is_local_cache), - CEREAL_NVP(m_node_sizes_vary), + CEREAL_NVP(m_node_sizes_vary), CEREAL_NVP(m_have_sample_sizes), CEREAL_NVP(m_owner), CEREAL_NVP(m_sample_sizes)); @@ -1662,6 +1718,12 @@ void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_read m_owner, m_sample_sizes); if (reader != nullptr) { +#ifdef LBANN_HAS_DISTCONV + int num_io_parts = dc::get_number_of_io_partitions(); +#else + int num_io_parts = 1; +#endif // LBANN_HAS_DISTCONV + m_reader = reader; m_comm = m_reader->get_comm(); m_shuffled_indices = &(m_reader->get_shuffled_indices()); @@ -1669,8 +1731,11 @@ void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_read m_trainer_master = m_comm->am_trainer_master(); m_rank_in_trainer = m_comm->get_rank_in_trainer(); m_rank_in_world = m_comm->get_rank_in_world(); + m_partition_in_trainer = m_rank_in_trainer/num_io_parts; // needs a better name which group you 
are in + m_offset_in_partition = m_rank_in_trainer%num_io_parts; m_np_in_trainer = m_comm->get_procs_per_trainer(); - } + m_num_partitions_in_trainer = m_np_in_trainer/num_io_parts; // rename this m_num_io_groups_in_trainer + } // Open metadata filename; this is in index re, checkpointed conduit filenames const std::string metadata_fn = get_metadata_fn(); @@ -1720,7 +1785,7 @@ std::string data_store_conduit::get_conduit_dir() const { } std::string data_store_conduit::get_cereal_fn() const { - return m_spill_dir_base + '/' + m_cereal_fn + "_" + m_reader->get_role() + "_" + std::to_string(m_rank_in_world) + ".xml"; + return m_spill_dir_base + '/' + m_cereal_fn + "_" + m_reader->get_role() + "_" + std::to_string(m_rank_in_world) + ".xml"; } std::string data_store_conduit::get_metadata_fn() const { @@ -1803,14 +1868,14 @@ void data_store_conduit::open_informational_files() { } void data_store_conduit::print_partial_owner_map(int n) { - std::cout << "\nHere is part of the owner map; m_owner.size(): " << m_owner.size() << std::endl; - std::map m; + std::cout << "\nHere is part of the owner map; m_owner.size(): " << m_owner.size() << std::endl; + std::map, int> m; for (auto t : m_owner) { m[t.first] = t.second; } int j = 0; for (auto t : m) { - std::cout << " sample_id: " << t.first << " owner: " << t.second << std::endl; + std::cout << " sample_id: " << t.first.first << ":" << t.first.second << " owner: " << t.second << std::endl; if (j++ >= 10) break; } } @@ -1851,7 +1916,7 @@ void data_store_conduit::test_imagenet_node(int index, bool dereference) { std::cout << "; (>= INT_MAX)\n"; } else { std::cout << std::endl; - } + } conduit::Node nd1; image_reader->load_conduit_node_from_file(data_id, nd1); char *buf1 = nd1[LBANN_DATA_ID_STR(data_id) + "/buffer"].value(); @@ -1870,7 +1935,7 @@ void data_store_conduit::test_imagenet_node(int index, bool dereference) { const conduit::Schema &s = nd2.schema(); s.print(); nd2.print(); - } + } @@ -1940,13 +2005,13 @@ void data_store_conduit::check_query_flags() const { } } -void data_store_conduit::clear_owner_map() { +void data_store_conduit::clear_owner_map() { m_owner_maps_were_exchanged = false; - m_owner.clear(); + m_owner.clear(); } void data_store_conduit::verify_sample_size() { - // Note: m_compacted_sample_size is set during calls to set_conduit_node() or + // Note: m_compacted_sample_size is set during calls to set_conduit_node() or // set_preloaded_conduit_node(). Hence, if these are not called (i.e, the // rank does not own any data), m_compacted_sample_size will be zero. 
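A toy, MPI-free stand-in for the behavior this note describes (hypothetical values; the real method agrees on the size over the trainer's communicator):

  #include <algorithm>
  #include <cassert>
  #include <vector>

  int main() {
    // Compacted sample sizes as seen by four ranks; owners of no data report 0.
    std::vector<long> per_rank = {512, 0, 512, 0};
    // Stand-in for the trainer-wide exchange: adopt the maximum reported size.
    const long agreed = *std::max_element(per_rank.begin(), per_rank.end());
    for (auto& s : per_rank) { if (s == 0) s = agreed; }
    for (auto s : per_rank) { assert(s == agreed); } // every rank knows the size
    return 0;
  }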
// This method ensures that all ranks know the sample size, whether or not @@ -1979,11 +2044,9 @@ size_t data_store_conduit::get_mem_usage() { LBANN_ERROR("node does not have a valid contiguous data pointer"); } r += nd.total_bytes_compact(); - } + } return r; } } // namespace lbann - - diff --git a/src/io/data_buffers/partitioned_io_buffer.cpp b/src/io/data_buffers/partitioned_io_buffer.cpp index 14495a8463a..5d51747f9ea 100644 --- a/src/io/data_buffers/partitioned_io_buffer.cpp +++ b/src/io/data_buffers/partitioned_io_buffer.cpp @@ -26,6 +26,8 @@ #include "lbann/io/data_buffers/partitioned_io_buffer.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/profiling.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { @@ -69,6 +71,9 @@ partitioned_io_buffer& partitioned_io_buffer::op template void partitioned_io_buffer::fp_setup_data(El::Int cur_mini_batch_size, int idx) { +#ifdef LBANN_HAS_DISTCONV + cur_mini_batch_size *= dc::get_number_of_io_partitions(); +#endif for (auto& buf : m_data_buffers) { buf.second->m_input_buffers[idx]->Resize(buf.second->m_input_buffers[idx]->Height(), cur_mini_batch_size); } @@ -76,8 +81,25 @@ void partitioned_io_buffer::fp_setup_data(El::Int cur_mini_batch template void partitioned_io_buffer::setup_data(El::Int num_neurons, El::Int num_targets, El::Int max_mini_batch_size) { +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + num_neurons /= dc::get_number_of_io_partitions(); + // TODO: Make sure that TensorDatType is equivalent to the HDF5 + // data reader's data type (float as default). + // TensorDataType is assumed to be 2-byte integer types such as + // short or int16_t in an older version. + // assert_eq(sizeof(TensorDataType), sizeof(short)); + max_mini_batch_size *= dc::get_number_of_io_partitions(); + } +#endif // LBANN_HAS_DISTCONV El::Int local_mini_batch_size = max_mini_batch_size / this->m_comm->get_procs_per_trainer(); El::Int partial_mini_batch_size = max_mini_batch_size % this->m_comm->get_procs_per_trainer(); +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + assert_eq(local_mini_batch_size, 1); + assert_eq(partial_mini_batch_size, 0); + } +#endif // LBANN_HAS_DISTCONV if(partial_mini_batch_size > 0 && this->m_comm->get_rank_in_trainer() < partial_mini_batch_size) { local_mini_batch_size++; } @@ -104,6 +126,7 @@ template int partitioned_io_buffer::fetch_to_local_matrix(generic_data_reader *data_reader, execution_mode mode) { int num_parallel_readers = data_reader->get_num_parallel_readers(); + prof_region_begin("fetch_to_local_matrix", prof_colors[2], false); /// Coordinate all available readers so that the perform I/O in the same step /// Check to make sure that the local matrix has space for data data_buffer *buf = get_data_buffer(mode); @@ -121,15 +144,24 @@ int partitioned_io_buffer::fetch_to_local_matrix(generic_data_re // m_num_data_per_epoch+=num_samples_fetched; /// BVE FIXME need to change how this is shared } } + prof_region_end("fetch_to_local_matrix", false); return buf->m_num_samples_fetched; } template void partitioned_io_buffer::distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample, AbsDistMatrixType& response) { + prof_region_begin("distribute_from_local_matrix", prof_colors[3], false); data_buffer *buf = get_data_buffer(mode); Copy(*buf->m_input_buffers[0], sample); Copy(*buf->m_input_buffers[1], response); +#ifdef LBANN_HAS_DISTCONV + if (dc::is_cosmoflow_parallel_io_enabled()) { + 
response.Resize(response.Height(), response.Width() / + dc::get_number_of_io_partitions()); + } +#endif buf->m_num_samples_fetched = 0; + prof_region_end("distribute_from_local_matrix", false); return; } diff --git a/src/layers/activations/elu.cu b/src/layers/activations/elu.cu index 7999168dd3d..b0bf373916d 100644 --- a/src/layers/activations/elu.cu +++ b/src/layers/activations/elu.cu @@ -96,7 +96,7 @@ void local_fp(TensorDataType alpha, // Launch CUDA kernel if (grid_dim > 0) { - fp_kernel<<>>( + fp_kernel<<>>( alpha, height, width, input.LockedBuffer(), input.LDim(), output.Buffer(), output.LDim()); @@ -125,7 +125,7 @@ void local_bp(TensorDataType alpha, // Launch CUDA kernel if (grid_dim > 0) { - bp_kernel<<>>( + bp_kernel<<>>( alpha, height, width, input.LockedBuffer(), input.LDim(), gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(), diff --git a/src/layers/activations/leaky_relu.cu b/src/layers/activations/leaky_relu.cu index 0a6ed4fd058..02d85c4e747 100644 --- a/src/layers/activations/leaky_relu.cu +++ b/src/layers/activations/leaky_relu.cu @@ -96,7 +96,7 @@ void local_fp(TensorDataType negative_slope, // Launch CUDA kernel if (grid_dim > 0) { - fp_kernel<<>>( + fp_kernel<<>>( negative_slope, height, width, input.LockedBuffer(), input.LDim(), output.Buffer(), output.LDim()); @@ -125,7 +125,7 @@ void local_bp(TensorDataType negative_slope, // Launch CUDA kernel if (grid_dim > 0) { - bp_kernel<<>>( + bp_kernel<<>>( negative_slope, height, width, input.LockedBuffer(), input.LDim(), gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(), diff --git a/src/layers/activations/log_softmax.cu b/src/layers/activations/log_softmax.cu index 8c93bd7a5a9..d4af472b7ca 100644 --- a/src/layers/activations/log_softmax.cu +++ b/src/layers/activations/log_softmax.cu @@ -301,8 +301,8 @@ void fp_compute_impl(log_softmax_layer sync_info{stream, event}; // Find max value in each column @@ -383,8 +383,8 @@ void bp_compute_impl(log_softmax_layer sync_info{stream, event}; // Compute sum of entries in gradient w.r.t. 
output diff --git a/src/layers/activations/softmax.cu b/src/layers/activations/softmax.cu index 95965f53426..e7b38046d07 100644 --- a/src/layers/activations/softmax.cu +++ b/src/layers/activations/softmax.cu @@ -377,8 +377,8 @@ void fp_compute_impl(softmax_layer sync_info{stream, event}; // Find max value in each column @@ -462,8 +462,8 @@ void bp_compute_impl(softmax_layer sync_info{stream, event}; // Compute dot(y,dy) diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp index e744d505896..a5d20aeee1d 100644 --- a/src/layers/data_type_distconv_adapter.cpp +++ b/src/layers/data_type_distconv_adapter.cpp @@ -28,6 +28,7 @@ #include "lbann/layers/data_type_layer.hpp" #include "lbann/models/model.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" +#include "lbann/trainers/trainer.hpp" namespace lbann { @@ -327,7 +328,7 @@ setup_prev_activations_i(int index) const { const dc::LocaleMPI loc(dc::get_mpi_comm(), false); t = make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(El::GPUManager::Stream()); + t->zero(hydrogen::cuda::GetDefaultStream()); } else { // Create a shallow copy const auto &parent_activations = @@ -345,7 +346,7 @@ dc::Shape data_type_distconv_adapter::get_prev_activations_shape int input_index) const { const auto input_dims = layer().get_input_dims(input_index); std::vector input_tensor_shape_v(input_dims.rbegin(), input_dims.rend()); - input_tensor_shape_v.push_back(layer().get_model()->get_max_mini_batch_size()); + input_tensor_shape_v.push_back(get_max_mini_batch_size()); return dc::Shape(input_tensor_shape_v); } @@ -361,7 +362,7 @@ dc::Shape data_type_distconv_adapter::get_activations_shape( int output_index) const { const auto output_dims = layer().get_output_dims(output_index); std::vector output_tensor_shape_v(output_dims.rbegin(), output_dims.rend()); - output_tensor_shape_v.push_back(layer().get_model()->get_max_mini_batch_size()); + output_tensor_shape_v.push_back(get_max_mini_batch_size()); return dc::Shape(output_tensor_shape_v); } @@ -421,7 +422,7 @@ setup_activations_i(int index) const { const auto local_shape = get_activations_local_shape(index); auto t = make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(El::GPUManager::Stream()); + t->zero(hydrogen::cuda::GetDefaultStream()); return t; } @@ -475,7 +476,7 @@ setup_prev_error_signals_i(int index) const { const dc::LocaleMPI loc(dc::get_mpi_comm(), false); t = make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(El::GPUManager::Stream()); + t->zero(hydrogen::cuda::GetDefaultStream()); } else { // Create a shallow copy const auto &child_error_signals = @@ -540,7 +541,7 @@ setup_error_signals_i(int index) const { const auto local_shape = get_error_signals_local_shape(index); auto t = make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(El::GPUManager::Stream()); + t->zero(hydrogen::cuda::GetDefaultStream()); return t; } @@ -700,12 +701,13 @@ dc::TensorShuffler &get_shuffler( const Layer &layer, std::array*, 4> &shufflers, const dc::TensorDev &src, - const dc::TensorDev &dst) { + const dc::TensorDev &dst, + const size_t max_mini_batch_size) { const auto& c = static_cast( layer.get_model()->get_execution_context()); const auto& mini_batch_size = c.get_current_mini_batch_size(); int shuffler_idx = -1; - if (layer.get_model()->get_max_mini_batch_size() == mini_batch_size) { + if (max_mini_batch_size == mini_batch_size) { shuffler_idx = 0; } else { // The last 
remaining mini-batches for the train, validation, and @@ -727,28 +729,32 @@ template dc::TensorShuffler& data_type_distconv_adapter:: get_prev_activations_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_prev_activations_shufflers, src, dst); + return get_shuffler(layer(), m_prev_activations_shufflers, src, dst, + get_max_mini_batch_size()); } template dc::TensorShuffler& data_type_distconv_adapter:: get_activations_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_activations_shufflers, src, dst); + return get_shuffler(layer(), m_activations_shufflers, src, dst, + get_max_mini_batch_size()); } template dc::TensorShuffler& data_type_distconv_adapter:: get_prev_error_signals_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_prev_error_signals_shufflers, src, dst); + return get_shuffler(layer(), m_prev_error_signals_shufflers, src, dst, + get_max_mini_batch_size()); } template dc::TensorShuffler& data_type_distconv_adapter:: get_error_signals_shuffler( const dc::TensorDev &src, const dc::TensorDev &dst) { - return get_shuffler(layer(), m_error_signals_shufflers, src, dst); + return get_shuffler(layer(), m_error_signals_shufflers, src, dst, + get_max_mini_batch_size()); } template @@ -774,7 +780,7 @@ void data_type_distconv_adapter::ensure_prev_activations() { shuffler.shuffle_forward( get_original_prev_activations().get_const_base_ptr(), get_prev_activations().get_base_ptr(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } @@ -796,7 +802,7 @@ void data_type_distconv_adapter::copy_out_activations() { shuffler.shuffle_forward( get_activations().get_const_base_ptr(), get_original_activations().get_base_ptr(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } @@ -823,7 +829,7 @@ void data_type_distconv_adapter::ensure_prev_error_signals() { shuffler.shuffle_forward( get_original_prev_error_signals(i).get_const_base_ptr(), get_prev_error_signals(i).get_base_ptr(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } @@ -846,7 +852,7 @@ void data_type_distconv_adapter::copy_out_error_signals() { shuffler.shuffle_forward( get_error_signals(i).get_const_base_ptr(), get_original_error_signals(i).get_base_ptr(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } @@ -878,6 +884,11 @@ void data_type_distconv_adapter::dump_original_error_signals() { get_name() + "_error_signals_original"); } +template +size_t data_type_distconv_adapter::get_max_mini_batch_size() const { + return layer().get_model()->get_max_mini_batch_size_distconv(); +} + #define PROTO(T) \ template class data_type_distconv_adapter diff --git a/src/layers/data_type_layer.cpp b/src/layers/data_type_layer.cpp index f2f6ad3c1b0..116bd4e8921 100644 --- a/src/layers/data_type_layer.cpp +++ b/src/layers/data_type_layer.cpp @@ -122,7 +122,7 @@ void data_type_layer::forward_prop() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) #ifdef LBANN_HAS_DISTCONV @@ -143,7 +143,7 @@ void data_type_layer::forward_prop() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { 
hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) m_fp_time += get_time() - fp_start; @@ -161,7 +161,7 @@ void data_type_layer::back_prop_impl_() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) #ifdef LBANN_HAS_DISTCONV @@ -182,7 +182,7 @@ void data_type_layer::back_prop_impl_() { #if defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) // Synchronize GPUs and check for errors - if (using_gpus()) { El::GPUManager::SynchronizeDevice(true); } + if (using_gpus()) { hydrogen::gpu::SynchronizeDevice(); } #endif // defined(LBANN_HAS_GPU) && defined(LBANN_DEBUG) m_bp_time += get_time() - bp_start; @@ -860,7 +860,7 @@ void data_type_layer::bp_setup_gradient_wrt_inputs( #ifdef LBANN_HAS_DISTCONV template -void data_type_layer::setup_distconv_adapter() { +void data_type_layer::setup_distconv_adapter(const DataReaderMetaData& dr_metadata) { this->get_distconv_adapter_ptr() = make_unique>(*this); } diff --git a/src/layers/image/bilinear_resize.cu b/src/layers/image/bilinear_resize.cu index d755373b67b..b2326d1e59d 100644 --- a/src/layers/image/bilinear_resize.cu +++ b/src/layers/image/bilinear_resize.cu @@ -147,7 +147,7 @@ void bilinear_resize_layer::fp_compute() { // Launch CUDA kernel if (grid_dim > 0) { fp_kernel - <<>>( + <<>>( num_samples, num_channels, input_height, input_width, local_input.LockedBuffer(), local_input.LDim(), diff --git a/src/layers/io/input/input_layer.cpp b/src/layers/io/input/input_layer.cpp index 6efae118ec2..b1ffad6f4d5 100644 --- a/src/layers/io/input/input_layer.cpp +++ b/src/layers/io/input/input_layer.cpp @@ -34,8 +34,9 @@ namespace lbann { template input_distconv_adapter:: -input_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer), - m_shuffle_required(true) { +input_distconv_adapter(Layer& layer, const bool shuffle_required) + : data_type_distconv_adapter(layer), + m_shuffle_required(shuffle_required) { // Input data is only processed when its consumer layer is also // enabled for distconv for (int i = 0; i < layer.get_num_children(); ++i) { @@ -165,7 +166,7 @@ setup_activations_i(int index) const { const auto local_shape = get_activations_local_shape(index); auto t = make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(El::GPUManager::Stream()); + t->zero(hydrogen::cuda::GetDefaultStream()); return t; } } @@ -252,7 +253,7 @@ template ::fp_compute() { auto &l = dynamic_cast&>(this->layer()); - auto stream = El::GPUManager::Stream(); + auto stream = hydrogen::cuda::GetDefaultStream(); // Note that the mini-batch size of the data reader is not // actually the one for the current mini-batch as the mini-batch // index is already updated by fp_compute. @@ -262,8 +263,11 @@ void input_distconv_adapter::fp_comp for (int mat_idx = 0; mat_idx < l.get_num_children(); ++mat_idx) { if (!is_input_processed(mat_idx)) continue; - assert_eq(mb_size * dc::get_number_of_io_partitions(), - l.get_activations(mat_idx).Width()); + // TODO: This is disabled as it raises an error when the HDF5 data + // reader with hyperslab labels is used. Remove this assertion or + // reshape the activation tensor (mat_idx=1).
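The commented-out assertion just below encodes the slab-based I/O layout: with N = dc::get_number_of_io_partitions(), each rank reads 1/N of every sample but holds N times as many activation columns, so the per-rank element count is unchanged. A standalone sketch with made-up sizes (not LBANN API):

  #include <cassert>

  int main() {
    const long io_parts = 4;               // N = dc::get_number_of_io_partitions()
    const long sample_size = 128*128*128;  // linearized sample, e.g. a 3D volume
    const long mb_size = 2;                // mini-batch slice owned by this rank
    const long slab_rows = sample_size / io_parts; // rows each rank actually reads
    const long local_cols = mb_size * io_parts;    // columns in the local matrix
    // Repartitioning preserves the total element count per rank.
    assert(slab_rows * local_cols == sample_size * mb_size);
    return 0;
  }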
+ // assert_eq(mb_size * dc::get_number_of_io_partitions(), + // l.get_activations(mat_idx).Width()); auto &original_tensor = *m_original_host_tensors[mat_idx]; auto &host_tensor = *m_host_tensors[mat_idx]; diff --git a/src/layers/layer.cpp b/src/layers/layer.cpp index c7eb4351843..c409c461e85 100644 --- a/src/layers/layer.cpp +++ b/src/layers/layer.cpp @@ -401,7 +401,7 @@ void Layer::setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata) { setup_dims(dr_metadata); setup_matrices(m_comm->get_trainer_grid()); #ifdef LBANN_HAS_DISTCONV - prepare_distconv(); + prepare_distconv(dr_metadata); #endif // LBANN_HAS_DISTCONV setup_data(max_mini_batch_size); if (using_gpus()) { setup_gpu(); } @@ -646,9 +646,9 @@ void Layer::set_layer_pointers(std::vector layers) { } #ifdef LBANN_HAS_DISTCONV -void Layer::prepare_distconv() { +void Layer::prepare_distconv(const DataReaderMetaData& dr_metadata) { if (distconv_enabled()) { - setup_distconv_adapter(); + setup_distconv_adapter(dr_metadata); } } diff --git a/src/layers/learning/CMakeLists.txt b/src/layers/learning/CMakeLists.txt index 3b9207d7f8a..55e19abe0b6 100644 --- a/src/layers/learning/CMakeLists.txt +++ b/src/layers/learning/CMakeLists.txt @@ -10,6 +10,7 @@ set_full_path(THIS_DIR_SOURCES embedding.cpp embedding_builder.cpp fully_connected.cpp + gru.cpp ) if (LBANN_HAS_CUDA) diff --git a/src/layers/learning/base_convolution.cpp b/src/layers/learning/base_convolution.cpp index 68c3d42cbf7..3faa3f5817d 100644 --- a/src/layers/learning/base_convolution.cpp +++ b/src/layers/learning/base_convolution.cpp @@ -1215,7 +1215,8 @@ base_convolution_layer::get_backward_filter_algo_cudnn( #ifdef LBANN_HAS_DISTCONV template -void base_convolution_layer::setup_distconv_adapter() { +void base_convolution_layer::setup_distconv_adapter( + const DataReaderMetaData& dr_metadata) { this->get_distconv_adapter_ptr() = make_unique< base_convolution_adapter>(*this); } @@ -1254,15 +1255,11 @@ void base_convolution_adapter::setup_fp_tensors() { std::reverse(kernel_shape.begin(), kernel_shape.end()); const dc::LocaleMPI loc(dc::get_mpi_comm(), false); m_kernel = make_unique(kernel_shape, loc, shared_dist); - assert0(dc::tensor::View( - *m_kernel, layer.weights_values(0).LockedBuffer())); if (layer.m_bias_scaling_factor != TensorDataType(0)) { dc::Shape bias_shape(dc::get_num_dims(layer), 1); bias_shape[dc::get_channel_dim()] = layer.get_output_dims()[0]; m_bias = make_unique(bias_shape, loc, shared_dist); - assert0(dc::tensor::View( - *m_bias, layer.weights_values(1).LockedBuffer())); } } @@ -1280,13 +1277,14 @@ void base_convolution_adapter::setup_bp_tensors() { m_kernel_gradient = make_unique(kernel_shape, loc, shared_dist); // Gradient buffer is needed for auto-tuning the bp filter algorithm + auto* kernel_optimizer = static_cast*>(l.get_weights(0).get_optimizer()); assert0(dc::tensor::View( *m_kernel_gradient, - l.get_weights(0).get_optimizer()->get_gradient().Buffer())); + kernel_optimizer->get_gradient().Buffer())); // Bias tensor. Shared by all procs if (l.m_bias_scaling_factor != TensorDataType(0)) { - auto* bias_optimizer = l.get_weights(1).get_optimizer(); + auto* bias_optimizer = static_cast*>(l.get_weights(1).get_optimizer()); if (bias_optimizer != nullptr) { dc::Shape bias_shape(dc::get_num_dims(l), 1); bias_shape[dc::get_channel_dim()] = l.get_output_dims()[0]; @@ -1295,7 +1293,7 @@ void base_convolution_adapter::setup_bp_tensors() { // which is set when its view is set. 
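The static_casts above recover the typed optimizer so that its gradient buffer can be handed to the tensor view below. A simplified mock of that pattern (hypothetical classes, not the real LBANN hierarchy); the nullptr guard matters because frozen weights carry no optimizer:

  #include <iostream>
  #include <vector>

  struct optimizer { virtual ~optimizer() = default; };

  template <typename T>
  struct data_type_optimizer : optimizer {
    std::vector<T>& get_gradient() { return m_gradient; }
  private:
    std::vector<T> m_gradient = std::vector<T>(16);
  };

  int main() {
    data_type_optimizer<float> adam;   // trainable weight: optimizer exists
    optimizer* generic = &adam;        // what a generic get_optimizer() hands back
    // static_cast is safe when the creator knows the concrete type, and a null
    // generic pointer also static_casts to null, so the guard still works.
    auto* typed = static_cast<data_type_optimizer<float>*>(generic);
    if (typed != nullptr) {
      std::cout << "gradient elements: " << typed->get_gradient().size() << "\n";
    }
    return 0;
  }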
assert0(dc::tensor::View( *m_bias_gradient, - l.get_weights(1).get_optimizer()->get_gradient().Buffer())); + bias_optimizer->get_gradient().Buffer())); } } } @@ -1365,7 +1363,7 @@ void base_convolution_adapter::bp_compute_convolution_fi this->get_prev_error_signals(), dst_scale, *m_bias_gradient, false); } else { - m_bias_gradient->scale(dst_scale, El::GPUManager::Stream()); + m_bias_gradient->scale(dst_scale, hydrogen::cuda::GetDefaultStream()); } } @@ -1383,9 +1381,17 @@ void base_convolution_adapter::bp_compute_convolution_fi dst_scale, *m_kernel_gradient, false); } else { - m_kernel_gradient->scale(dst_scale, El::GPUManager::Stream()); + m_kernel_gradient->scale(dst_scale, hydrogen::cuda::GetDefaultStream()); } } + + +#define PROTO_DEVICE(T, Device) \ + template class base_convolution_adapter + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + #endif // LBANN_HAS_DISTCONV #define PROTO_DEVICE(T, Device) \ diff --git a/src/layers/learning/channelwise_scale_bias.cu b/src/layers/learning/channelwise_scale_bias.cu index e603a36f5b2..54e3b07bc57 100644 --- a/src/layers/learning/channelwise_scale_bias.cu +++ b/src/layers/learning/channelwise_scale_bias.cu @@ -204,7 +204,7 @@ void channelwise_scale_bias_layer::fp_compute() { grid_dims.y = (local_width + block_size_y - 1) / block_size_y; grid_dims.z = num_channels; fp_kernel - <<>>( + <<>>( num_channels, channel_size, local_width, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), @@ -254,7 +254,7 @@ void channelwise_scale_bias_layer::bp_compute() { grid_dims.y = (local_width + block_size_y - 1) / block_size_y; grid_dims.z = num_channels; bp_kernel - <<>>( + <<>>( num_channels, channel_size, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), diff --git a/src/layers/learning/convolution.cpp b/src/layers/learning/convolution.cpp index e9ea59a2b58..ea7723cafbc 100644 --- a/src/layers/learning/convolution.cpp +++ b/src/layers/learning/convolution.cpp @@ -162,7 +162,8 @@ void convolution_layer::bp_compute() { #if defined LBANN_HAS_DISTCONV template -void convolution_layer::setup_distconv_adapter() { +void convolution_layer::setup_distconv_adapter( + const DataReaderMetaData& dr_metadata) { this->get_distconv_adapter_ptr() = make_unique< convolution_distconv_adapter>(*this); } diff --git a/src/layers/learning/deconvolution.cpp b/src/layers/learning/deconvolution.cpp index 3f80353e369..79c1e3e1e58 100644 --- a/src/layers/learning/deconvolution.cpp +++ b/src/layers/learning/deconvolution.cpp @@ -185,7 +185,7 @@ void deconvolution_layer::bp_compute() { #if defined LBANN_HAS_DISTCONV template void deconvolution_layer -::setup_distconv_adapter() { +::setup_distconv_adapter(const DataReaderMetaData& dr_metadata) { this->get_distconv_adapter_ptr() = make_unique< deconvolution_distconv_adapter>(*this); } diff --git a/src/layers/learning/embedding.cu b/src/layers/learning/embedding.cu index 29e002be307..df780404ded 100644 --- a/src/layers/learning/embedding.cu +++ b/src/layers/learning/embedding.cu @@ -135,7 +135,7 @@ void embedding_layer::fp_compute() { grid_dims.x = (this->m_embedding_dim + block_size - 1) / block_size; grid_dims.y = input_size; grid_dims.z = local_mini_batch_size; - fp_kernel<<>>( + fp_kernel<<>>( this->m_num_embeddings, this->m_embedding_dim, input_size, @@ -177,7 +177,7 @@ void embedding_layer::bp_compute() { grid_dims.x = (this->m_embedding_dim + block_size - 1) / block_size; grid_dims.y = 
input_size; grid_dims.z = local_mini_batch_size; - bp_kernel<<>>( + bp_kernel<<>>( this->m_num_embeddings, this->m_embedding_dim, input_size, diff --git a/src/layers/learning/entrywise_scale_bias.cu b/src/layers/learning/entrywise_scale_bias.cu index 695986244cc..d16dd3b3857 100644 --- a/src/layers/learning/entrywise_scale_bias.cu +++ b/src/layers/learning/entrywise_scale_bias.cu @@ -118,7 +118,7 @@ void fp_impl( block_dims.y = block_size_y; grid_dims.x = (local_height + block_size_x - 1) / block_size_x; grid_dims.y = (local_width + block_size_y - 1) / block_size_y; - fp_kernel<<>>( + fp_kernel<<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), @@ -153,7 +153,7 @@ void bp_impl( dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; - bp_kernel <<>>( + bp_kernel <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), diff --git a/src/layers/learning/gru.cpp b/src/layers/learning/gru.cpp new file mode 100644 index 00000000000..4227b6723f2 --- /dev/null +++ b/src/layers/learning/gru.cpp @@ -0,0 +1,936 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_GRU_LAYER_INSTANTIATE +#include "lbann/layers/learning/gru.hpp" +#include "lbann/models/model.hpp" +#include "lbann/weights/initializer.hpp" +#include "lbann/proto/proto_common.hpp" +#include + +namespace lbann { + +// --------------------------------------------- +// Life cycle +// --------------------------------------------- + +template +gru_layer::gru_layer(lbann_comm* comm, size_t hidden_size) + : data_type_layer(comm), + m_hidden_size{hidden_size} { + this->m_expected_num_parent_layers = 2; +} + +template +gru_layer::gru_layer(const gru_layer& other) + : data_type_layer(other), + m_hidden_size{other.m_hidden_size} +#ifdef LBANN_HAS_CUDNN + , m_rnn_cudnn_desc{other.m_rnn_cudnn_desc}, + m_input_cudnn_desc{other.m_input_cudnn_desc}, + m_output_cudnn_desc{other.m_output_cudnn_desc}, + m_hidden_cudnn_desc{other.m_hidden_cudnn_desc}, + m_packed_weights_cudnn_desc{other.m_packed_weights_cudnn_desc} +#endif // LBANN_HAS_CUDNN +{ +#ifdef LBANN_HAS_CUDNN + /// @todo Copy m_cudnn_reserve_space?
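One defensible answer to the @todo above: the reserve space is per-forward-pass scratch, and fp_compute() later in this file re-sizes it from cudnnGetRNNTrainingReserveSize() before every call, so a copied layer can start with an empty buffer. A stand-in sketch of that copy policy (simplified types, not the real layer):

  #include <cstddef>
  #include <vector>

  struct gru_like {
    std::size_t hidden_size = 0;
    int rnn_desc = 0; // stands in for the cuDNN descriptors, which are copied
    std::vector<unsigned char> reserve_space; // scratch, re-sized each forward pass

    gru_like() = default;
    gru_like(const gru_like& other)
      : hidden_size(other.hidden_size),
        rnn_desc(other.rnn_desc) {} // reserve_space deliberately left empty

    void forward(std::size_t required_bytes) {
      reserve_space.resize(required_bytes); // mirrors allocate(cudnn_reserve_space_size)
    }
  };

  int main() {
    gru_like a;
    a.forward(1024);
    gru_like b(a); // copies configuration, not scratch
    return b.reserve_space.size() == 0 ? 0 : 1;
  }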
+#endif // LBANN_HAS_CUDNN +} + +template +gru_layer& gru_layer +::operator=(const gru_layer& other) { + data_type_layer::operator=(other); + m_hidden_size = other.m_hidden_size; +#ifdef LBANN_HAS_CUDNN + m_rnn_cudnn_desc = other.m_rnn_cudnn_desc; + m_input_cudnn_desc = other.m_input_cudnn_desc; + m_output_cudnn_desc = other.m_output_cudnn_desc; + m_hidden_cudnn_desc = other.m_hidden_cudnn_desc; + m_packed_weights_cudnn_desc = other.m_packed_weights_cudnn_desc; + /// @todo Copy m_cudnn_reserve_space? +#endif // LBANN_HAS_CUDNN + return *this; +} + +template +gru_layer* +gru_layer +::copy() const +{ + return new gru_layer(*this); +} + +// --------------------------------------------- +// Query functions +// --------------------------------------------- + +template +std::string +gru_layer +::get_type() const +{ + return "GRU"; +} + +template +data_layout +gru_layer +::get_data_layout() const +{ + return Layout; +} + +template +El::Device +gru_layer +::get_device_allocation() const +{ + return Device; +} + +template +description +gru_layer +::get_description() const +{ + auto desc = data_type_layer::get_description(); + desc.add("Hidden size", m_hidden_size); + return desc; +} + +// --------------------------------------------- +// Setup +// --------------------------------------------- + +template +void gru_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + const int sequence_length = this->get_input_dims(0)[0]; + if (static_cast(this->get_input_size(1)) != m_hidden_size) { + LBANN_ERROR( + this->get_type()," layer \"",this->get_name(),"\" ", + "has an invalid input tensor for the initial hidden state"); + } + const std::vector output_dims = {sequence_length, static_cast(m_hidden_size)}; + this->set_output_dims(output_dims); +} + +template +void gru_layer +::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + const size_t sequence_length = this->get_input_dims()[0]; + const size_t input_size = this->get_input_size(0) / sequence_length; + + // Construct default weights if needed + if (!this->has_weights()) { + const std::vector weight_names + = {"ih_matrix", "hh_matrix", "ih_bias", "hh_bias"}; + this->set_num_weights(4); + const auto scale = El::To(1./std::sqrt(m_hidden_size)); + for (size_t i=0; i<4; ++i) { + auto w = make_unique>(this->get_comm()); + auto init = make_unique>(-scale, scale); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_" + weight_names[i]); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->set_weights(i, w.get()); + this->m_model->add_weights(std::move(w)); + } + } + if (this->num_weights() != 4) { + LBANN_ERROR( + "attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 4, found ",this->num_weights(),")"); + } + + // Setup weight dimensions and distribution + auto& ih_matrix = this->get_weights(0); + auto& hh_matrix = this->get_weights(1); + auto& ih_bias = this->get_weights(2); + auto& hh_bias = this->get_weights(3); + ih_matrix.set_dims({static_cast(3*m_hidden_size)}, {static_cast(input_size)}); + hh_matrix.set_dims({static_cast(3*m_hidden_size)}, {static_cast(m_hidden_size)}); + ih_bias.set_dims({static_cast(3*m_hidden_size)}); + hh_bias.set_dims({static_cast(3*m_hidden_size)}); + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + dist.rowDist = El::STAR; + ih_matrix.set_matrix_distribution(dist); 
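// As we read the dimensions set above: with input_size I and hidden_size H,
// ih_matrix is (3H) x I, hh_matrix is (3H) x H, and each bias is (3H) x 1;
// the factor of three stacks the GRU gates row-wise, matching the
// El::IR(i*hidden_size, (i+1)*hidden_size) slices in the packing helpers
// further down. The STAR,STAR distribution replicates all four weights on
// every rank, so each process can assemble cuDNN's packed parameter buffer
// locally.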
+ hh_matrix.set_matrix_distribution(dist); + ih_bias.set_matrix_distribution(dist); + hh_bias.set_matrix_distribution(dist); + +} + +#ifdef LBANN_HAS_CUDNN +template +void gru_layer::setup_gpu() { + + // Dimensions + const size_t sequence_length = this->get_input_dims(0)[0]; + const size_t input_size = this->get_input_size(0) / sequence_length; + + // GPU objects + auto&& handle = cudnn::get_handle(); + auto data_type = cudnn::get_data_type(); + + // RNN descriptor + size_t dropout_state_size; + CHECK_CUDNN(cudnnDropoutGetStatesSize(handle, &dropout_state_size)); + cudnn::DropoutDescriptor dropout_desc(0.f, nullptr, dropout_state_size, 0); + m_rnn_cudnn_desc.set( + m_hidden_size, + 1, // num_layers + dropout_desc, + CUDNN_LINEAR_INPUT, + CUDNN_UNIDIRECTIONAL, + CUDNN_GRU, + CUDNN_RNN_ALGO_STANDARD, + data_type); + CHECK_CUDNN( + cudnnSetRNNMatrixMathType( + m_rnn_cudnn_desc, + cudnn::get_default_convolution_math_type())); + + // Input and output tensor descriptors + m_input_cudnn_desc.set(data_type, 1, input_size, 1); + m_output_cudnn_desc.set(data_type, 1, m_hidden_size, 1); + m_hidden_cudnn_desc.set(data_type, 1, 1, m_hidden_size); + + // Packed weights descriptor + size_t weights_size; + CHECK_CUDNN( + cudnnGetRNNParamsSize( + handle, + m_rnn_cudnn_desc, + m_input_cudnn_desc, + &weights_size, + data_type)); + m_packed_weights_cudnn_desc.set( + data_type, + CUDNN_TENSOR_NCHW, + weights_size / sizeof(TensorDataType), + 1, + 1); + +} +#endif // LBANN_HAS_CUDNN + +// --------------------------------------------- +// Forward prop +// --------------------------------------------- + +template +void gru_layer::fp_compute() { + fp_compute_impl(*this); +} + +namespace { +#ifdef LBANN_HAS_CUDNN +template +hydrogen::simple_buffer pack_cudnn_rnn_weights( + const cudnnHandle_t& handle, + const cudnn::RNNDescriptor& rnn_desc, + const cudnn::TensorDescriptor& input_desc, + const cudnn::FilterDescriptor& weights_desc, + const El::SyncInfo& sync_info, + size_t input_size, + size_t hidden_size, + const El::Matrix& ih_matrix, + const El::Matrix& hh_matrix, + const El::Matrix& ih_bias, + const El::Matrix& hh_bias) { + + // Allocate buffer for packed weights + size_t packed_weights_size; + CHECK_CUDNN( + cudnnGetRNNParamsSize( + handle, + rnn_desc, + input_desc, + &packed_weights_size, + cudnn::get_data_type())); + hydrogen::simple_buffer packed_weights(packed_weights_size, sync_info); + + // Construct objects + static cudnn::FilterDescriptor result_weights_desc; + result_weights_desc.create(); + El::Matrix packed_weights_view; + packed_weights_view.SetSyncInfo(sync_info); + + // Functions to get pointers in packed weights buffer + auto get_matrix_ptr = [&] (size_t id) -> TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerMatrixParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + packed_weights.data(), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + auto get_bias_ptr = [&] (size_t id) -> TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerBiasParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + packed_weights.data(), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + + // Copy from ih_matrix + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + input_size, + hidden_size, + get_matrix_ptr(i), + input_size); + El::Transpose( + ih_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view, + 
false); + } + + // Copy from hh_matrix + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + hidden_size, + hidden_size, + get_matrix_ptr(3+i), + hidden_size); + El::Transpose( + hh_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view, + false); + } + + // Copy from ih_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + hidden_size, + 1, + get_bias_ptr(i), + hidden_size); + El::Copy( + ih_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view); + } + + // Copy from hh_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.Attach( + hidden_size, + 1, + get_bias_ptr(3+i), + hidden_size); + El::Copy( + hh_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL), + packed_weights_view); + } + + return packed_weights; +} +#endif // LBANN_HAS_CUDNN +} // namespace + +#ifdef LBANN_HAS_CUDNN +template +void fp_compute_impl( + gru_layer& l) { + using LocalMat = El::Matrix; + using ByteBuffer = hydrogen::simple_buffer; + + // Matrices + const auto& input_sequence + = dynamic_cast(l.get_local_prev_activations(0)); + const auto& init_hidden + = dynamic_cast(l.get_local_prev_activations(1)); + auto& output_sequence + = dynamic_cast(l.get_local_activations()); + const auto& ih_matrix + = dynamic_cast(l.weights_values(0).LockedMatrix()); + const auto& hh_matrix + = dynamic_cast(l.weights_values(1).LockedMatrix()); + const auto& ih_bias + = dynamic_cast(l.weights_values(2).LockedMatrix()); + const auto& hh_bias + = dynamic_cast(l.weights_values(3).LockedMatrix()); + + // Dimensions + const size_t sequence_length = l.get_input_dims(0)[0]; + const size_t mini_batch_size = input_sequence.Width(); + const size_t input_size = l.get_input_size(0) / sequence_length; + const size_t hidden_size = l.m_hidden_size; + + // Return immediately if there is no local data + if (mini_batch_size <= 0) { + return; + } + + // GPU objects + auto&& sync_info = input_sequence.GetSyncInfo(); + auto&& stream = sync_info.Stream(); + auto&& handle = cudnn::get_handle(); + const auto data_type = cudnn::get_data_type(); + + // Configure input and output tensor descriptors + auto& input_desc = l.m_input_cudnn_desc; + auto& output_desc = l.m_output_cudnn_desc; + auto& hidden_desc = l.m_hidden_cudnn_desc; + input_desc.set(data_type, mini_batch_size, input_size, 1); + output_desc.set(data_type, mini_batch_size, hidden_size, 1); + hidden_desc.set(data_type, 1, mini_batch_size, hidden_size); + std::vector + input_desc_list(sequence_length, input_desc), + output_desc_list(sequence_length, output_desc); + + // Reorder input tensor dims + // Note: cuDNN uses sequence_length x mini_batch_size x hidden_size + LocalMat input_sequence_workspace, output_sequence_workspace; + input_sequence_workspace.SetSyncInfo(sync_info); + output_sequence_workspace.SetSyncInfo(sync_info); + input_sequence_workspace.Resize(mini_batch_size*input_size, sequence_length); + output_sequence_workspace.Resize(mini_batch_size*hidden_size, sequence_length); + constexpr size_t one{1}; + cuda::copy_tensor( + stream, + {mini_batch_size, sequence_length, input_size}, + input_sequence.LockedBuffer(), + {static_cast(input_sequence.LDim()), input_size, one}, + input_sequence_workspace.Buffer(), + {input_size, mini_batch_size*input_size, one}); + + // Pack weights into workspace buffer + auto packed_weights = pack_cudnn_rnn_weights( + handle, + l.m_rnn_cudnn_desc, + input_desc, + l.m_packed_weights_cudnn_desc, + sync_info, + input_size, + hidden_size, + ih_matrix, + hh_matrix, + ih_bias, + hh_bias); + + // 
Allocate cuDNN workspace buffers + /// @todo Handle synchronization for m_cudnn_reserve_space + size_t cudnn_workspace_size, cudnn_reserve_space_size; + CHECK_CUDNN( + cudnnGetRNNWorkspaceSize( + handle, + l.m_rnn_cudnn_desc, + sequence_length, + input_desc_list.data(), + &cudnn_workspace_size)); + CHECK_CUDNN( + cudnnGetRNNTrainingReserveSize( + handle, + l.m_rnn_cudnn_desc, + sequence_length, + input_desc_list.data(), + &cudnn_reserve_space_size)); + ByteBuffer cudnn_workspace(cudnn_workspace_size, sync_info); + l.m_cudnn_reserve_space.allocate(cudnn_reserve_space_size); + + // Launch cuDNN GRU + CHECK_CUDNN( + cudnnRNNForwardTraining( + handle, + l.m_rnn_cudnn_desc, + sequence_length, + input_desc_list.data(), + input_sequence_workspace.LockedBuffer(), + hidden_desc, + init_hidden.LockedBuffer(), + hidden_desc, // cxDesc + nullptr, // cx + l.m_packed_weights_cudnn_desc, + packed_weights.data(), + output_desc_list.data(), + output_sequence_workspace.Buffer(), + hidden_desc, // hyDesc + nullptr, // hy + hidden_desc, // cyDesc + nullptr, // cy + cudnn_workspace.data(), + cudnn_workspace.size(), + l.m_cudnn_reserve_space.data(), + l.m_cudnn_reserve_space.size())); + + // Reorder output tensor dims + // Note: cuDNN uses sequence_length x mini_batch_size x hidden_size + cuda::copy_tensor( + stream, + {mini_batch_size, sequence_length, hidden_size}, + output_sequence_workspace.LockedBuffer(), + {hidden_size, mini_batch_size*hidden_size, one}, + output_sequence.Buffer(), + {static_cast(output_sequence.LDim()), hidden_size, one}); + +} +#endif // LBANN_HAS_CUDNN + +// --------------------------------------------- +// Back prop +// --------------------------------------------- + +template +void gru_layer::bp_compute() { + bp_compute_impl(*this); +} + +namespace { +#ifdef LBANN_HAS_CUDNN +template +void unpack_cudnn_rnn_weights( + const cudnnHandle_t& handle, + const cudnn::RNNDescriptor& rnn_desc, + const cudnn::TensorDescriptor& input_desc, + const cudnn::FilterDescriptor& weights_desc, + const El::SyncInfo& sync_info, + size_t input_size, + size_t hidden_size, + const TensorDataType* packed_weights_buffer, + El::Matrix& ih_matrix, + El::Matrix& hh_matrix, + El::Matrix& ih_bias, + El::Matrix& hh_bias) { + + // Construct objects + static cudnn::FilterDescriptor result_weights_desc; + result_weights_desc.create(); + El::Matrix packed_weights_view; + packed_weights_view.SetSyncInfo(sync_info); + + // Functions to get pointers in packed weights buffer + auto get_matrix_ptr = [&] (size_t id) -> const TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerMatrixParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + const_cast(reinterpret_cast(packed_weights_buffer)), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + auto get_bias_ptr = [&] (size_t id) -> const TensorDataType* { + TensorDataType* ptr; + CHECK_CUDNN( + cudnnGetRNNLinLayerBiasParams( + handle, + rnn_desc, + 0, // pseudoLayer + input_desc, + weights_desc, + const_cast(reinterpret_cast(packed_weights_buffer)), + id, // linLayerID + result_weights_desc, + reinterpret_cast(&ptr))); + return ptr; + }; + + // Copy from ih_matrix + for (auto i : {0, 1, 2}) { + packed_weights_view.LockedAttach( + input_size, + hidden_size, + get_matrix_ptr(i), + input_size); + auto ih_matrix_view = ih_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Transpose(packed_weights_view, ih_matrix_view, false); + } + + // Copy from hh_matrix + for (auto i : {0, 1, 
2}) { + packed_weights_view.LockedAttach( + hidden_size, + hidden_size, + get_matrix_ptr(3+i), + hidden_size); + auto hh_matrix_view = hh_matrix(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Transpose(packed_weights_view, hh_matrix_view, false); + } + + // Copy from ih_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.LockedAttach( + hidden_size, + 1, + get_bias_ptr(i), + hidden_size); + auto ih_bias_view = ih_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Copy(packed_weights_view, ih_bias_view); + } + + // Copy from hh_bias + for (auto i : {0, 1, 2}) { + packed_weights_view.LockedAttach( + hidden_size, + 1, + get_bias_ptr(3+i), + hidden_size); + auto hh_bias_view = hh_bias(El::IR(i*hidden_size, (i+1)*hidden_size), El::ALL); + El::Copy(packed_weights_view, hh_bias_view); + } + +} +#endif // LBANN_HAS_CUDNN +} // namespace + +#ifdef LBANN_HAS_CUDNN +template +void bp_compute_impl( + gru_layer& l) { + using LocalMat = El::Matrix; + using ByteBuffer = hydrogen::simple_buffer; + + // Matrices + const auto& input_sequence + = dynamic_cast(l.get_local_prev_activations(0)); + const auto& init_hidden + = dynamic_cast(l.get_local_prev_activations(1)); + const auto& output_sequence + = dynamic_cast(l.get_local_activations()); + const auto& output_sequence_grad + = dynamic_cast(l.get_local_prev_error_signals()); + auto& input_sequence_grad + = dynamic_cast(l.get_local_error_signals(0)); + auto& init_hidden_grad + = dynamic_cast(l.get_local_error_signals(1)); + const auto& ih_matrix + = dynamic_cast(l.weights_values(0).LockedMatrix()); + const auto& hh_matrix + = dynamic_cast(l.weights_values(1).LockedMatrix()); + const auto& ih_bias + = dynamic_cast(l.weights_values(2).LockedMatrix()); + const auto& hh_bias + = dynamic_cast(l.weights_values(3).LockedMatrix()); + + // Dimensions + const size_t sequence_length = l.get_input_dims(0)[0]; + const size_t mini_batch_size = input_sequence.Width(); + const size_t input_size = l.get_input_size(0) / sequence_length; + const size_t hidden_size = l.m_hidden_size; + + // GPU objects + auto&& sync_info = input_sequence.GetSyncInfo(); + auto&& stream = sync_info.Stream(); + auto&& handle = cudnn::get_handle(); + + // Define closure to send weight gradients to optimizers + LocalMat ih_matrix_grad, hh_matrix_grad, ih_bias_grad, hh_bias_grad; + ih_matrix_grad.SetSyncInfo(sync_info); + hh_matrix_grad.SetSyncInfo(sync_info); + ih_bias_grad.SetSyncInfo(sync_info); + hh_bias_grad.SetSyncInfo(sync_info); + ih_matrix_grad.Resize(3*hidden_size, input_size); + hh_matrix_grad.Resize(3*hidden_size, hidden_size); + ih_bias_grad.Resize(3*hidden_size, 1); + hh_bias_grad.Resize(3*hidden_size, 1); + auto send_weight_grads_to_optimizers = [&] () { + TensorDataType buf_scale, in_scale; + auto&& ih_matrix_opt = l.get_weights(0).get_optimizer(); + auto&& hh_matrix_opt = l.get_weights(1).get_optimizer(); + auto&& ih_bias_opt = l.get_weights(2).get_optimizer(); + auto&& hh_bias_opt = l.get_weights(3).get_optimizer(); + if (ih_matrix_opt != nullptr) { + auto& buf = ih_matrix_opt->get_gradient_buffer(buf_scale, in_scale, true); + El::Scale(buf_scale, buf); + El::Axpy(in_scale, ih_matrix_grad, buf.Matrix()); + } + if (hh_matrix_opt != nullptr) { + auto& buf = hh_matrix_opt->get_gradient_buffer(buf_scale, in_scale, true); + El::Scale(buf_scale, buf); + El::Axpy(in_scale, hh_matrix_grad, buf.Matrix()); + } + if (ih_bias_opt != nullptr) { + auto& buf = ih_bias_opt->get_gradient_buffer(buf_scale, in_scale, true); + El::Scale(buf_scale, buf); + El::Axpy(in_scale, 
ih_bias_grad, buf.Matrix());
+    }
+    if (hh_bias_opt != nullptr) {
+      auto& buf = hh_bias_opt->get_gradient_buffer(buf_scale, in_scale, true);
+      El::Scale(buf_scale, buf);
+      El::Axpy(in_scale, hh_bias_grad, buf.Matrix());
+    }
+  };
+
+  // Return immediately if there is no local data
+  if (mini_batch_size <= 0) {
+    El::Zero(ih_matrix_grad);
+    El::Zero(hh_matrix_grad);
+    El::Zero(ih_bias_grad);
+    El::Zero(hh_bias_grad);
+    send_weight_grads_to_optimizers();
+    return;
+  }
+
+  // Configure input and output tensor descriptors
+  // Note: Descriptor dims have already been set in forward prop
+  auto& input_desc = l.m_input_cudnn_desc;
+  auto& output_desc = l.m_output_cudnn_desc;
+  auto& hidden_desc = l.m_hidden_cudnn_desc;
+  std::vector<cudnnTensorDescriptor_t>
+    input_desc_list(sequence_length, input_desc),
+    output_desc_list(sequence_length, output_desc);
+
+  // Reorder tensor dims
+  // Note: cuDNN uses sequence_length x mini_batch_size x size
+  LocalMat input_sequence_workspace, output_sequence_workspace;
+  LocalMat input_sequence_grad_workspace, output_sequence_grad_workspace;
+  input_sequence_workspace.SetSyncInfo(sync_info);
+  output_sequence_workspace.SetSyncInfo(sync_info);
+  input_sequence_grad_workspace.SetSyncInfo(sync_info);
+  output_sequence_grad_workspace.SetSyncInfo(sync_info);
+  input_sequence_workspace.Resize(mini_batch_size*input_size, sequence_length);
+  output_sequence_workspace.Resize(mini_batch_size*hidden_size, sequence_length);
+  input_sequence_grad_workspace.Resize(mini_batch_size*input_size, sequence_length);
+  output_sequence_grad_workspace.Resize(mini_batch_size*hidden_size, sequence_length);
+  constexpr size_t one{1};
+  cuda::copy_tensor(
+    stream,
+    {mini_batch_size, sequence_length, input_size},
+    input_sequence.LockedBuffer(),
+    {sequence_length*input_size, input_size, one},
+    input_sequence_workspace.Buffer(),
+    {input_size, mini_batch_size*input_size, one});
+  cuda::copy_tensor(
+    stream,
+    {mini_batch_size, sequence_length, hidden_size},
+    output_sequence.LockedBuffer(),
+    {sequence_length*hidden_size, hidden_size, one},
+    output_sequence_workspace.Buffer(),
+    {hidden_size, mini_batch_size*hidden_size, one});
+  cuda::copy_tensor(
+    stream,
+    {mini_batch_size, sequence_length, hidden_size},
+    output_sequence_grad.LockedBuffer(),
+    {sequence_length*hidden_size, hidden_size, one},
+    output_sequence_grad_workspace.Buffer(),
+    {hidden_size, mini_batch_size*hidden_size, one});
+
+  // Pack weights into workspace buffer
+  auto packed_weights = pack_cudnn_rnn_weights(
+    handle,
+    l.m_rnn_cudnn_desc,
+    input_desc,
+    l.m_packed_weights_cudnn_desc,
+    sync_info,
+    input_size,
+    hidden_size,
+    ih_matrix,
+    hh_matrix,
+    ih_bias,
+    hh_bias);
+  LocalMat weights_grad_workspace;
+  weights_grad_workspace.SetSyncInfo(sync_info);
+  El::Zeros(
+    weights_grad_workspace,
+    packed_weights.size() / sizeof(TensorDataType),
+    1);
+
+  // Allocate cuDNN workspace buffers
+  size_t cudnn_workspace_size;
+  CHECK_CUDNN(
+    cudnnGetRNNWorkspaceSize(
+      handle,
+      l.m_rnn_cudnn_desc,
+      sequence_length,
+      input_desc_list.data(),
+      &cudnn_workspace_size));
+  ByteBuffer cudnn_workspace(cudnn_workspace_size, sync_info);
+
+  // Launch cuDNN GRU backprop
+  CHECK_CUDNN(
+    cudnnRNNBackwardData(
+      handle,
+      l.m_rnn_cudnn_desc,
+      sequence_length,
+      output_desc_list.data(),
+      output_sequence_workspace.LockedBuffer(),
+      output_desc_list.data(),
+      output_sequence_grad_workspace.LockedBuffer(),
+      hidden_desc, // dhyDesc
+      nullptr,
+      hidden_desc, // dcyDesc
+      nullptr,
+      l.m_packed_weights_cudnn_desc,
+      packed_weights.data(),
+      hidden_desc,
init_hidden.LockedBuffer(), + hidden_desc, // cxDesc + nullptr, + input_desc_list.data(), + input_sequence_grad_workspace.Buffer(), + hidden_desc, + init_hidden_grad.Buffer(), + hidden_desc, // dcxDesc + nullptr, + cudnn_workspace.data(), + cudnn_workspace.size(), + l.m_cudnn_reserve_space.data(), + l.m_cudnn_reserve_space.size())); + CHECK_CUDNN( + cudnnRNNBackwardWeights( + handle, + l.m_rnn_cudnn_desc, + sequence_length, + input_desc_list.data(), + input_sequence_workspace.LockedBuffer(), + hidden_desc, + init_hidden.LockedBuffer(), + output_desc_list.data(), + output_sequence_workspace.LockedBuffer(), + cudnn_workspace.data(), + cudnn_workspace.size(), + l.m_packed_weights_cudnn_desc, + weights_grad_workspace.Buffer(), + l.m_cudnn_reserve_space.data(), + l.m_cudnn_reserve_space.size())); + + // Send gradients to optimizers + unpack_cudnn_rnn_weights( + handle, + l.m_rnn_cudnn_desc, + input_desc, + l.m_packed_weights_cudnn_desc, + sync_info, + input_size, + hidden_size, + weights_grad_workspace.LockedBuffer(), + ih_matrix_grad, + hh_matrix_grad, + ih_bias_grad, + hh_bias_grad); + send_weight_grads_to_optimizers(); + + // Reorder input grad tensor dims + // Note: cuDNN uses sequence_length x mini_batch_size x input_size + cuda::copy_tensor( + stream, + {mini_batch_size, sequence_length, input_size}, + input_sequence_grad_workspace.LockedBuffer(), + {input_size, mini_batch_size*input_size, one}, + input_sequence_grad.Buffer(), + {sequence_length*input_size, input_size, one}); + +} +#endif // LBANN_HAS_CUDNN + +// --------------------------------------------- +// Builder +// --------------------------------------------- + +namespace +{ + +template +struct Builder +{ + template + static std::unique_ptr Build(Args&&...) + { + LBANN_ERROR( + "Attempted to construct gru_layer with invalid parameters ", + "(TensorDataType=",TypeName(),", ", + "Layout=",to_string(Layout),", ", + "Device=",to_string(Device),")"); + return nullptr; + } +}; + +#ifdef LBANN_HAS_CUDNN +template +struct Builder +{ + template + static std::unique_ptr Build(Args&&... 
args) + { + constexpr auto Layout = data_layout::DATA_PARALLEL; + constexpr auto Device = El::Device::GPU; + using LayerType = gru_layer; + return make_unique(std::forward(args)...); + } +}; +#endif // LBANN_HAS_CUDNN + +} // namespace + +template +std::unique_ptr build_gru_layer_from_pbuf( + lbann_comm* comm, lbann_data::Layer const& proto_layer) +{ + using BuilderType = Builder; + LBANN_ASSERT_MSG_HAS_FIELD(proto_layer, gru); + const auto& params = proto_layer.gru(); + return BuilderType::Build(comm, params.hidden_size()); +} + +// --------------------------------------------- +// Explicit template instantiation +// --------------------------------------------- + +/// @todo CPU implementation +#ifdef LBANN_HAS_CUDNN +#define PROTO(T) \ + template class gru_layer< \ + T, data_layout::DATA_PARALLEL, El::Device::GPU>; +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#endif // LBANN_HAS_CUDNN + +#define PROTO_DEVICE(T, Device) \ + LBANN_LAYER_BUILDER_ETI(gru, T, Device) +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +} // namespace lbann diff --git a/src/layers/loss/categorical_accuracy.cu b/src/layers/loss/categorical_accuracy.cu index 5b1057887e6..b70a1f6b5ad 100644 --- a/src/layers/loss/categorical_accuracy.cu +++ b/src/layers/loss/categorical_accuracy.cu @@ -166,8 +166,8 @@ void fp_gpu(lbann_comm& comm, const auto& col_comm_root = loss.RowOwner(0); // GPU objects - auto&& stream = El::GPUManager::Stream(); - auto&& event = El::GPUManager::Event(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + auto&& event = hydrogen::cuda::GetDefaultEvent(); El::SyncInfo sync_info{stream, event}; // Initialize CUDA threads/blocks for reduction kernel diff --git a/src/layers/loss/cross_entropy.cu b/src/layers/loss/cross_entropy.cu index a8980e621ba..1a9aa30c83e 100644 --- a/src/layers/loss/cross_entropy.cu +++ b/src/layers/loss/cross_entropy.cu @@ -91,9 +91,9 @@ void local_fp_gpu(const El::AbstractMatrix& local_prediction, block_dims.x = block_size; grid_dims.x = (height + block_size - 1) / block_size; grid_dims.y = width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( height, width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), @@ -147,9 +147,9 @@ void local_bp_gpu(const El::AbstractMatrix& local_prediction, block_dims.x = block_size; grid_dims.x = (height + block_size - 1) / block_size; grid_dims.y = width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( height, width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), diff --git a/src/layers/loss/entrywise.cu b/src/layers/loss/entrywise.cu index bddaf92b53b..deecafdee7f 100644 --- a/src/layers/loss/entrywise.cu +++ b/src/layers/loss/entrywise.cu @@ -92,9 +92,9 @@ void apply_binary_backprop_operator( // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); binary_backprop_operator_kernel - <<>>( + <<>>( height, width, x1.LockedBuffer(), x1.LDim(), x2.LockedBuffer(), x2.LDim(), diff --git a/src/layers/loss/l1_norm.cu b/src/layers/loss/l1_norm.cu index 8e5db8ef411..63dabae361d 100644 --- a/src/layers/loss/l1_norm.cu +++ b/src/layers/loss/l1_norm.cu @@ 
-84,9 +84,9 @@ void local_fp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_contribution.Buffer()); @@ -132,9 +132,9 @@ void local_bp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), diff --git a/src/layers/loss/l2_norm2.cu b/src/layers/loss/l2_norm2.cu index 916375a776b..78c4823b0cf 100644 --- a/src/layers/loss/l2_norm2.cu +++ b/src/layers/loss/l2_norm2.cu @@ -84,9 +84,9 @@ void local_fp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_contribution.Buffer()); @@ -125,9 +125,9 @@ void local_bp_gpu(const El::AbstractMatrix& local_input, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), diff --git a/src/layers/loss/mean_absolute_error.cu b/src/layers/loss/mean_absolute_error.cu index 0b591b92836..ed8332a1045 100644 --- a/src/layers/loss/mean_absolute_error.cu +++ b/src/layers/loss/mean_absolute_error.cu @@ -90,9 +90,9 @@ void local_fp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), @@ -157,9 +157,9 @@ void local_bp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), diff --git a/src/layers/loss/mean_squared_error.cu b/src/layers/loss/mean_squared_error.cu index 6a404cb7fe9..5a57666299e 100644 --- a/src/layers/loss/mean_squared_error.cu +++ b/src/layers/loss/mean_squared_error.cu @@ -90,9 +90,9 @@ void local_fp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + 
CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); fp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), @@ -148,9 +148,9 @@ void local_bp_gpu(El::Int height, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; grid_dims.y = local_width; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); bp_kernel - <<>>( + <<>>( height, local_height, local_width, local_prediction.LockedBuffer(), local_prediction.LDim(), local_ground_truth.LockedBuffer(), local_ground_truth.LDim(), diff --git a/src/layers/loss/top_k_categorical_accuracy.cu b/src/layers/loss/top_k_categorical_accuracy.cu index a388e1ab6ff..472791d688f 100644 --- a/src/layers/loss/top_k_categorical_accuracy.cu +++ b/src/layers/loss/top_k_categorical_accuracy.cu @@ -201,8 +201,8 @@ void fp_gpu(lbann_comm& comm, const auto& col_comm_root = loss.RowOwner(0); // GPU objects - auto&& stream = El::GPUManager::Stream(); - auto&& event = El::GPUManager::Event(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + auto&& event = hydrogen::cuda::GetDefaultEvent(); El::SyncInfo syncInfo{stream, event}; cuda::thrust::allocator<> alloc(stream); diff --git a/src/layers/math/binary.cu b/src/layers/math/binary.cu index 3367334a08c..23681272089 100644 --- a/src/layers/math/binary.cu +++ b/src/layers/math/binary.cu @@ -93,9 +93,9 @@ void apply_binary_backprop_operator(const El::AbstractMatrix& x1 // Launch CUDA kernel if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); binary_backprop_operator_kernel - <<>>( + <<>>( height, width, x1.LockedBuffer(), x1.LDim(), x2.LockedBuffer(), x2.LDim(), diff --git a/src/layers/math/clamp.cu b/src/layers/math/clamp.cu index 13947b016c6..04a0583419e 100644 --- a/src/layers/math/clamp.cu +++ b/src/layers/math/clamp.cu @@ -101,7 +101,7 @@ void local_fp(TensorDataType min, // Launch CUDA kernel if (grid_dim > 0) { - fp_kernel<<>>( + fp_kernel<<>>( min, max, height, width, input.LockedBuffer(), input.LDim(), output.Buffer(), output.LDim()); @@ -131,7 +131,7 @@ void local_bp(TensorDataType min, // Launch CUDA kernel if (grid_dim > 0) { - bp_kernel<<>>( + bp_kernel<<>>( min, max, height, width, input.LockedBuffer(), input.LDim(), gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(), diff --git a/src/layers/math/matmul.cpp b/src/layers/math/matmul.cpp index 4baabf62f03..d529aa7b729 100644 --- a/src/layers/math/matmul.cpp +++ b/src/layers/math/matmul.cpp @@ -173,7 +173,7 @@ void fp_compute_impl(matmul_layer::fp_compute() { grid_dims.y = num_channels; grid_dims.z = local_width; mean_kernel - <<>>( + <<>>( num_channels, channel_size, local_width, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim()); @@ -171,7 +171,7 @@ void channelwise_mean_layer::bp_compute() { grid_dims.x = (channel_size + block_size - 1) / block_size; grid_dims.y = num_channels; grid_dims.z = local_width; - backprop_kernel<<>>( + backprop_kernel<<>>( num_channels, channel_size, local_width, local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), diff --git a/src/layers/misc/channelwise_softmax.cu b/src/layers/misc/channelwise_softmax.cu index 641d7708e7e..842d9f60da8 100644 --- a/src/layers/misc/channelwise_softmax.cu +++ b/src/layers/misc/channelwise_softmax.cu @@ -235,7 +235,7 
@@ void fp_impl(size_t num_channels, grid_dims.z = local_mini_batch_size; LocalMat maxvals(grid_dims.x * num_channels, local_mini_batch_size); fp_max_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_input.LockedBuffer(), {static_cast(local_input.LDim()), channel_size, 1}, @@ -247,7 +247,7 @@ void fp_impl(size_t num_channels, const LocalMat prev_maxvals(std::move(maxvals)); maxvals.Resize(grid_dims.x * num_channels, local_mini_batch_size); fp_max_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, prev_dim}, prev_maxvals.LockedBuffer(), {static_cast(prev_maxvals.LDim()), prev_dim, 1}, @@ -268,7 +268,7 @@ void fp_impl(size_t num_channels, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; fp_denom_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_input.LockedBuffer(), {static_cast(local_input.LDim()), channel_size, 1}, @@ -285,7 +285,7 @@ void fp_impl(size_t num_channels, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; fp_output_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_input.LockedBuffer(), {static_cast(local_input.LDim()), channel_size, 1}, @@ -446,7 +446,7 @@ void bp_impl(size_t num_channels, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_y_dot_dy_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_output.LockedBuffer(), {static_cast(local_output.LDim()), channel_size, 1}, @@ -464,7 +464,7 @@ void bp_impl(size_t num_channels, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_input_grad_kernel - <<>>( + <<>>( {local_mini_batch_size, num_channels, channel_size}, local_output.LockedBuffer(), {static_cast(local_output.LDim()), channel_size, 1}, diff --git a/src/layers/misc/covariance.cu b/src/layers/misc/covariance.cu index 91c906b676c..9a488d18955 100644 --- a/src/layers/misc/covariance.cu +++ b/src/layers/misc/covariance.cu @@ -209,7 +209,7 @@ void fp_gpu(const El::AbstractDistMatrix& input0, grid_dims.y = local_width; const auto& scale = El::TypeTraits::One() / TensorDataType(height); mean_contribution_kernel - <<>>( + <<>>( local_height, local_width, scale, local_input0.LockedBuffer(), local_input0.LDim(), local_input1.LockedBuffer(), local_input1.LDim(), @@ -229,7 +229,7 @@ void fp_gpu(const El::AbstractDistMatrix& input0, grid_dims.y = local_width; const auto& scale = El::TypeTraits::One() / (biased ? TensorDataType(height) : TensorDataType(height - 1)); covariance_contribution_kernel - <<>>( + <<>>( local_height, local_width, scale, local_input0.LockedBuffer(), local_input0.LDim(), local_input1.LockedBuffer(), local_input1.LDim(), @@ -276,7 +276,7 @@ void bp_gpu(const El::AbstractDistMatrix& input0, El::Int grid_size = (local_height * local_width + block_size - 1) / block_size; if (grid_size > 0) { covariance_backprop_kernel - <<>>( + <<>>( local_height, local_width, scale, local_workspace.LockedBuffer(), local_input0.LockedBuffer(), local_input0.LDim(), diff --git a/src/layers/misc/dist_embedding.cpp b/src/layers/misc/dist_embedding.cpp index 735ca420f8e..f8f135bf9a9 100644 --- a/src/layers/misc/dist_embedding.cpp +++ b/src/layers/misc/dist_embedding.cpp @@ -366,56 +366,27 @@ struct Builder } }; -template <> -struct Builder -{ - template - static std::unique_ptr Build(Args&&... 
args)
-  {
-    using TensorDataType = float;
-    constexpr data_layout Layout = data_layout::DATA_PARALLEL;
-    constexpr El::Device Device = El::Device::CPU;
+#define DEFINE_BUILDER(TensorDataType, Device) \
+template <> \
+struct Builder<TensorDataType, data_layout::DATA_PARALLEL, Device> \
+{ \
+  template <typename... Args> \
+  static std::unique_ptr<Layer> Build(Args&&... args) \
+  { \
+    constexpr data_layout Layout = data_layout::DATA_PARALLEL; \
+    using LayerType = dist_embedding_layer<TensorDataType, Layout, Device>; \
+    return make_unique<LayerType>(std::forward<Args>(args)...); \
+  } \
+}
 #ifdef LBANN_HAS_SHMEM
-    using LayerType = dist_embedding_layer<TensorDataType, Layout, Device>;
-    return make_unique<LayerType>(std::forward<Args>(args)...);
-#else
-    LBANN_ERROR(
-      "Attempted to construct CPU dist_embedding_layer, ",
-      "but LBANN has not been built with OpenSHMEM support "
-      "(TensorDataType=",TypeName<TensorDataType>(),", ",
-      "Layout=",to_string(Layout),", ",
-      "Device=",to_string(Device),")");
-    return nullptr;
+DEFINE_BUILDER(float, El::Device::CPU);
+DEFINE_BUILDER(double, El::Device::CPU);
 #endif // LBANN_HAS_SHMEM
-  }
-};
-
-#ifdef LBANN_HAS_GPU
-template <>
-struct Builder<float, data_layout::DATA_PARALLEL, El::Device::GPU>
-{
-  template <typename... Args>
-  static std::unique_ptr<Layer> Build(Args&&... args)
-  {
-    using TensorDataType = float;
-    constexpr data_layout Layout = data_layout::DATA_PARALLEL;
-    constexpr El::Device Device = El::Device::GPU;
-#ifdef LBANN_HAS_NVSHMEM
-    using LayerType = dist_embedding_layer<TensorDataType, Layout, Device>;
-    return make_unique<LayerType>(std::forward<Args>(args)...);
-#else
-    LBANN_ERROR(
-      "Attempted to construct GPU dist_embedding_layer, ",
-      "but LBANN has not been built with NVSHMEM support "
-      "(TensorDataType=",TypeName<TensorDataType>(),", ",
-      "Layout=",to_string(Layout),", ",
-      "Device=",to_string(Device),")");
-    return nullptr;
-#endif // LBANN_HAS_NVSHMEM
-  }
-
-};
-#endif // LBANN_HAS_GPU
+#if defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM)
+DEFINE_BUILDER(float, El::Device::GPU);
+DEFINE_BUILDER(double, El::Device::GPU);
+#endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM)
+#undef DEFINE_BUILDER
 
 } // namespace
 
@@ -444,10 +415,14 @@ std::unique_ptr<Layer> build_dist_embedding_layer_from_pbuf(
 #ifdef LBANN_HAS_SHMEM
 template class dist_embedding_layer<
   float, data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class dist_embedding_layer<
+  double, data_layout::DATA_PARALLEL, El::Device::CPU>;
 #endif // LBANN_HAS_SHMEM
 #if defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM)
 extern template class dist_embedding_layer<
   float, data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class dist_embedding_layer<
+  double, data_layout::DATA_PARALLEL, El::Device::GPU>;
 #endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM)
 
 #define PROTO_DEVICE(T, Device) \
diff --git a/src/layers/misc/dist_embedding.cu b/src/layers/misc/dist_embedding.cu
index 557cc04a499..b6b8901f9f8 100644
--- a/src/layers/misc/dist_embedding.cu
+++ b/src/layers/misc/dist_embedding.cu
@@ -341,7 +341,7 @@ void dist_embedding_layer<TensorDataType,Layout,Device>::fp_compute() {
   const size_t local_mini_batch_size = local_input.Width();
 
   // GPU objects
-  auto&& stream = El::GPUManager::Stream();
+  auto&& stream = hydrogen::cuda::GetDefaultStream();
   nvshmem::initialize();
 
   // Barrier to handle gradient checking
@@ -523,7 +523,7 @@ void dist_embedding_layer<TensorDataType,Layout,Device>::bp_compute() {
   const size_t local_mini_batch_size = local_output_grad.Width();
 
   // GPU objects
-  auto&& stream = El::GPUManager::Stream();
+  auto&& stream = hydrogen::cuda::GetDefaultStream();
 
   // Synchronize non-blocking barrier
   // Note: Make sure NVSHMEM workspaces are ready to receive gradients.
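For readers tracing the DEFINE_BUILDER macro introduced in dist_embedding.cpp above, the sketch below is roughly what the preprocessor emits for DEFINE_BUILDER(float, El::Device::CPU). The template arguments on the Builder specialization are inferred from the hand-written specializations this diff deletes, so treat the exact signature as an assumption rather than literal generated code:

    template <>
    struct Builder<float, data_layout::DATA_PARALLEL, El::Device::CPU>
    {
      template <typename... Args>
      static std::unique_ptr<Layer> Build(Args&&... args)
      {
        constexpr data_layout Layout = data_layout::DATA_PARALLEL;
        // The (type, device) pair is baked in by the macro arguments.
        using LayerType = dist_embedding_layer<float, Layout, El::Device::CPU>;
        return make_unique<LayerType>(std::forward<Args>(args)...);
      }
    };  // trailing ';' comes from the DEFINE_BUILDER(...) invocation

Guarding the macro invocations instead of each class body keeps the SHMEM/NVSHMEM availability checks in one place; an unsupported (type, device) pair now simply falls through to the primary Builder template's error path.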
@@ -653,7 +653,7 @@ void dist_embedding_layer::apply_sparse_sgd_step( LocalMat& local_embeddings) { // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); // Synchronize non-blocking barrier // Note: Make sure gradients have been received. @@ -696,6 +696,8 @@ void dist_embedding_layer::apply_sparse_sgd_step( /// @todo fp16 template class dist_embedding_layer< float, data_layout::DATA_PARALLEL, El::Device::GPU>; +template class dist_embedding_layer< + double, data_layout::DATA_PARALLEL, El::Device::GPU>; } // namespace lbann #endif // LBANN_HAS_NVSHMEM diff --git a/src/layers/misc/one_hot.cu b/src/layers/misc/one_hot.cu index 2ebfb92a9a8..8053ac71528 100644 --- a/src/layers/misc/one_hot.cu +++ b/src/layers/misc/one_hot.cu @@ -75,7 +75,7 @@ void one_hot_layer::fp_compute() { const size_t local_width = local_output.Width(); constexpr size_t block_size = 64; const size_t grid_size = (local_width + block_size - 1) / block_size; - fp_kernel<<>>( + fp_kernel<<>>( local_height, local_width, local_input.LockedBuffer(), diff --git a/src/layers/misc/variance.cu b/src/layers/misc/variance.cu index 8c70b7bb9aa..0a7ea7b08ef 100644 --- a/src/layers/misc/variance.cu +++ b/src/layers/misc/variance.cu @@ -150,7 +150,7 @@ void fp_gpu(const El::AbstractDistMatrix& input, grid_dims.y = local_width; const auto& scale = El::TypeTraits::One() / (biased ? TensorDataType(height) : TensorDataType(height - 1)); variance_contribution_kernel - <<>>( + <<>>( local_height, local_width, scale, local_input.LockedBuffer(), local_input.LDim(), local_means.LockedBuffer(), @@ -192,7 +192,7 @@ void bp_gpu(const El::AbstractDistMatrix& input, El::Int grid_size = (local_height * local_width + block_size - 1) / block_size; if (grid_size > 0) { variance_backprop_kernel - <<>>( + <<>>( local_height, local_width, scale, local_workspace.LockedBuffer(), local_input.LockedBuffer(), local_input.LDim(), diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index c81cb076fe2..abbb8bf11b3 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -33,66 +33,89 @@ namespace lbann { namespace { -/** CUDA kernel to compute channel sums. - * Sums and squares of sums are used to compute mean and variance. +/** Functor for adding arrays. */ +template +struct array_sum +{ + using ArrayType = cuda::array; + __device__ __forceinline__ + ArrayType operator()(const ArrayType& x, const ArrayType& y) + { + ArrayType sum; +#pragma unroll + for (size_t i = 0; i < N; ++i) { + sum[i] = x[i] + y[i]; + } + return sum; + } +}; + +/** Accumulate sums and sums of squares for each channel. + * + * On input, sums and sqsums are assumed to be filled with zeros. 
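+ * For channel k this computes
+ *   sums[k]   = sum_{i,j} x(i,k,j)
+ *   sqsums[k] = sum_{i,j} x(i,k,j)^2
+ * where i runs over positions within the channel and j over the local
+ * mini-batch. Each block accumulates private partial sums in a
+ * grid-stride loop, reduces them block-wide, and thread 0 commits the
+ * result to global memory with atomic adds.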
+ * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (channel_size / bsize) x num_channels x 1 */ -template -__global__ void channel_sums_kernel( - El::Int channel_height, - El::Int width, - const TensorDataType * __restrict__ data, El::Int data_ldim, +template +__global__ void fp_sums_kernel( + int mini_batch_size, + int num_channels, + int channel_size, + const TensorDataType * __restrict__ data, int data_ldim, TensorDataType * __restrict__ sums, TensorDataType * __restrict__ sqsums) { - // Indices - const El::Int tid = threadIdx.x; - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Initialize shared memory - __shared__ TensorDataType shared_sums[block_size]; - __shared__ TensorDataType shared_sqsums[block_size]; - - // Compute row sums in shared memory - TensorDataType private_sum = 0; - TensorDataType private_sqsum = 0; - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for (El::Int col = 0; col < width; ++col) { - const auto& x = data[row + col * data_ldim]; - private_sum += x; - private_sqsum += x * x; + // Indices and dimensions + constexpr int bdimy = 1; + constexpr int bdimz = 1; + const auto& tid = threadIdx.x; + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = blockIdx.y; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = gridDim.y; + + for (int channel = gidy; channel < num_channels; channel += nthreadsy) { + + // Accumulate sums and perform block-wide reduction + using array_t = cuda::array; + using array_sum_t = array_sum; + array_t sum_sqsum; + sum_sqsum[0] = TensorDataType(0); + sum_sqsum[1] = TensorDataType(0); + for (int i = gidx; i < channel_size; i += nthreadsx) { + for (int j = 0; j < mini_batch_size; ++j) { + const auto& x = data[i + channel*channel_size + j*data_ldim]; + sum_sqsum[0] += x; + sum_sqsum[1] += x * x; + } } - } - shared_sums[tid] = private_sum; - shared_sqsums[tid] = private_sqsum; - - // Compute channel sum with shared memory reduction - /// @todo unroll loops - for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { - __syncthreads(); - if(tid < stride) { - shared_sums[tid] += shared_sums[tid + stride]; - shared_sqsums[tid] += shared_sqsums[tid + stride]; + sum_sqsum = cuda::block_reduce(sum_sqsum); + + // Output result to global memory + if (tid == 0) { + cuda::atomic_add(&sums[channel], sum_sqsum[0]); + cuda::atomic_add(&sqsums[channel], sum_sqsum[1]); } - } - // Output channel sum to global memory - if (tid == 0) { - cuda::atomic_add(&sums[bidy], shared_sums[0]); - cuda::atomic_add(&sqsums[bidy], shared_sqsums[0]); } } -/** CUDA kernel to compute statistics. +/** Compute statistics for each channel. + * * On input, global_mean and global_var are assumed to contain sums * and squares of sums, respectively. 
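+ * Here n = num_per_sum: the sums convert via the usual unbiased
+ * estimator, mean = sum / n and var = (sqsum / n - mean^2) * n / (n - 1),
+ * and the decay factor folds the fresh statistics into the running
+ * mean and variance.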
+ * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (num_channels / bsize) x 1 x 1 */ template -__global__ void compute_statistics_kernel( - El::Int num_sums, - El::Int num_per_sum, +__global__ void fp_statistics_kernel( + int num_sums, + int num_per_sum, TensorDataType epsilon, TensorDataType decay, TensorDataType * __restrict__ global_mean, @@ -100,9 +123,9 @@ __global__ void compute_statistics_kernel( TensorDataType * __restrict__ global_running_mean, TensorDataType * __restrict__ global_running_var) { - const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int num_threads = blockDim.x * gridDim.x; - for (El::Int i = gid; i < num_sums; i += num_threads) { + const auto& gid = threadIdx.x + blockIdx.x * blockDim.x; + const auto& num_threads = blockDim.x * gridDim.x; + for (auto i = gid; i < num_sums; i += num_threads) { TensorDataType num_per_sum_dt = TensorDataType(num_per_sum); // Compute mean and variance @@ -123,54 +146,79 @@ __global__ void compute_statistics_kernel( } -/** CUDA kernel to apply batch normalization. */ -template -__global__ void batch_normalization_kernel( - El::Int channel_height, - El::Int width, - const TensorDataType * __restrict__ global_input, El::Int input_ldim, +/** Compute outputs. + * + * y_i = (x_i - mean) / sqrt(var + epsilon) + * + * Block dimensions: bdimx x bdimy x bdimz + * + * Grid dimensions: (channel_size / bdimx) x (mini_batch_size / bdimy) x (num_channels / bdimz) + * + */ +template +__global__ void fp_output_kernel( + int mini_batch_size, + int num_channels, + int channel_size, + const TensorDataType * __restrict__ global_input, int input_ldim, const TensorDataType * __restrict__ global_mean, const TensorDataType * __restrict__ global_var, TensorDataType epsilon, const TensorDataType * __restrict__ global_scale, const TensorDataType * __restrict__ global_bias, - TensorDataType * __restrict__ global_output, El::Int output_ldim) { - - // Indices - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Copy batch normalization parameters to private memory - const auto& mean = global_mean[bidy]; - const auto& var = global_var[bidy]; - const auto& scale = global_scale[bidy]; - const auto& bias = global_bias[bidy]; - - // Get reciprocal of standard deviation - const auto& inv_stdev = cuda::rsqrt(var + epsilon); - - // Apply batch normalization - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for (El::Int col = 0; col < width; ++col) { - const auto& x = global_input[row + col * input_ldim]; - const auto& xhat = (x - mean) * inv_stdev; - const auto& y = scale * xhat + bias; - global_output[row + col * output_ldim] = y; + TensorDataType * __restrict__ global_output, int output_ldim) { + + // Indices and dimensions + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = threadIdx.y + blockIdx.y * blockDim.y; + const auto& gidz = threadIdx.z + blockIdx.z * blockDim.z; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = blockDim.y * gridDim.y; + const auto& nthreadsz = blockDim.z * gridDim.z; + + for (auto k = gidz; k < num_channels; k += nthreadsz) { + const auto& mean = global_mean[k]; + const auto& var = global_var[k]; + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& scale = global_scale[k]; + const auto& bias = global_bias[k]; + for (auto j = gidy; j < mini_batch_size; j += nthreadsy) { + for (auto i = gidx; i < channel_size; i += nthreadsx) { + const auto& x = global_input[i + 
k*channel_size + j*input_ldim]; + const auto& xhat = (x - mean) * inv_stdev; + const auto& y = scale * xhat + bias; + global_output[i + k*channel_size + j*output_ldim] = y; + } } } } -/** CUDA kernel to compute gradients w.r.t. batch norm parameters. */ -template -__global__ void backprop1_kernel( - El::Int channel_height, - El::Int width, +/** Compute gradients w.r.t. statistics and affine transform. + * + * dL/dscale = sum(dL/dy_i * xhat_i) + * + * dL/dbias = sum(dL/dy_i) + * + * dL/dmean = - sum(dL/dy_i) / sqrt(var+epsilon) + * + * dL/dvar = - sum(dL/dy_i * (x_i-mean)) * (var+epsilon)^(-3/2) / 2 + * + * On input, means_grad and vars_grad are filled with zeros. + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (channel_size / bsize) x num_channels x 1 + */ +template +__global__ void bp_statistics_grad_kernel( + int mini_batch_size, + int num_channels, + int channel_size, const TensorDataType * __restrict__ global_input, - El::Int input_ldim, + int input_ldim, const TensorDataType * __restrict__ global_gradient_wrt_output, - El::Int gradient_wrt_output_ldim, + int gradient_wrt_output_ldim, const TensorDataType * __restrict__ global_mean, const TensorDataType * __restrict__ global_var, TensorDataType epsilon, @@ -180,82 +228,82 @@ __global__ void backprop1_kernel( TensorDataType * __restrict__ global_dmean, TensorDataType * __restrict__ global_dvar) { - // Indices - const El::Int tid = threadIdx.x; - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Initialize shared memory - __shared__ TensorDataType shared_dscale[block_size]; - __shared__ TensorDataType shared_dbias[block_size]; - __shared__ TensorDataType shared_dmean[block_size]; - __shared__ TensorDataType shared_dvar[block_size]; - - // Copy batch normalization parameters to private memory - const auto& mean = global_mean[bidy]; - const auto& var = global_var[bidy]; - const auto& scale = global_scale[bidy]; - - // Compute useful constants - const TensorDataType zero = TensorDataType(0); - const auto& inv_stdev = cuda::rsqrt(var + epsilon); - const auto& dvar_factor = inv_stdev * inv_stdev * inv_stdev / TensorDataType(2); - - // Compute row-wise gradient contributions in shared memory - auto dscale = zero; - auto dbias = zero; - auto dmean = zero; - auto dvar = zero; - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for(El::Int col = 0; col < width; ++col) { - const auto& x = global_input[row + col * input_ldim]; - const auto& xhat = (x - mean) * inv_stdev; - const auto& dy = global_gradient_wrt_output[row + col * gradient_wrt_output_ldim]; - dscale += dy * xhat; - dbias += dy; - const auto& dxhat = dy * scale; - dmean += - dxhat * inv_stdev; - dvar += - dxhat * (x - mean) * dvar_factor; + // Indices and dimensions + constexpr int bdimy = 1; + constexpr int bdimz = 1; + const auto& tid = threadIdx.x; + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = blockIdx.y; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = gridDim.y; + + for (int channel = gidy; channel < num_channels; channel += nthreadsy) { + + // Copy batch normalization parameters to private memory + const auto& mean = global_mean[channel]; + const auto& var = global_var[channel]; + const auto& scale = global_scale[channel]; + + // Compute useful constants + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& dvar_factor = inv_stdev * inv_stdev * inv_stdev * TensorDataType(0.5); + + // Accumulate sums and 
perform block-wide reduction + using array_t = cuda::array; + using array_sum_t = array_sum; + array_t sums; + sums[0] = TensorDataType(0); + sums[1] = TensorDataType(0); + sums[2] = TensorDataType(0); + sums[3] = TensorDataType(0); + for (int i = gidx; i < channel_size; i += nthreadsx) { + for (int j = 0; j < mini_batch_size; ++j) { + const auto& x = global_input[i + channel*channel_size + j*input_ldim]; + const auto& xhat = (x - mean) * inv_stdev; + const auto& dy = global_gradient_wrt_output[i + + channel*channel_size + + j*gradient_wrt_output_ldim]; + sums[0] += dy * xhat; + sums[1] += dy; + const auto& dxhat = dy * scale; + sums[2] -= dxhat * inv_stdev; + sums[3] -= dxhat * (x - mean) * dvar_factor; + } } - } - shared_dscale[tid] = dscale; - shared_dbias[tid] = dbias; - shared_dmean[tid] = dmean; - shared_dvar[tid] = dvar; - - // Compute gradients with shared memory reduction - // @todo unroll loops - for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { - __syncthreads(); - if (tid < stride) { - shared_dscale[tid] += shared_dscale[tid + stride]; - shared_dbias[tid] += shared_dbias[tid + stride]; - shared_dmean[tid] += shared_dmean[tid + stride]; - shared_dvar[tid] += shared_dvar[tid + stride]; + sums = cuda::block_reduce(sums); + + // Output result to global memory + if (tid == 0) { + cuda::atomic_add(&global_dscale[channel], sums[0]); + cuda::atomic_add(&global_dbias[channel], sums[1]); + cuda::atomic_add(&global_dmean[channel], sums[2]); + cuda::atomic_add(&global_dvar[channel], sums[3]); } - } - // Output channel sum to global memory - if (tid == 0) { - cuda::atomic_add(&global_dscale[bidy], shared_dscale[0]); - cuda::atomic_add(&global_dbias[bidy], shared_dbias[0]); - cuda::atomic_add(&global_dmean[bidy], shared_dmean[0]); - cuda::atomic_add(&global_dvar[bidy], shared_dvar[0]); } } -/** CUDA kernel to compute gradients w.r.t. input. */ -template -__global__ void backprop2_kernel( - El::Int channel_height, - El::Int local_width, - El::Int num_per_sum, +/** Compute gradients w.r.t. input. 
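+ * The formula below combines three chain-rule paths through
+ * xhat_i = (x_i - mean) / sqrt(var + epsilon): directly, via
+ * dxhat_i/dx_i = 1 / sqrt(var + epsilon); through the mean, via
+ * dmean/dx_i = 1/n; and through the unbiased variance, via
+ * dvar/dx_i = 2 * (x_i - mean) / (n - 1).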
+ * + * dL/dx_i = ( dL/dxhat_i / sqrt(var+epsilon) + * + dL/dmean / n + * + dL/dvar * (x_i - mean) * 2/(n-1) ) + * + * Block dimensions: bdimx x bdimy x bdimz + * + * Grid dimensions: (channel_size / bdimx) x (mini_batch_size / bdimy) x (num_channels / bdimz) + */ +template +__global__ void bp_input_grad_kernel( + int mini_batch_size, + int num_channels, + int channel_size, + int num_per_sum, const TensorDataType * __restrict__ global_input, - El::Int input_ldim, + int input_ldim, const TensorDataType * __restrict__ global_gradient_wrt_output, - El::Int gradient_wrt_output_ldim, + int gradient_wrt_output_ldim, const TensorDataType * __restrict__ global_mean, const TensorDataType * __restrict__ global_var, TensorDataType epsilon, @@ -263,33 +311,33 @@ __global__ void backprop2_kernel( const TensorDataType * __restrict__ global_dmean, const TensorDataType * __restrict__ global_dvar, TensorDataType * __restrict__ global_gradient_wrt_input, - El::Int gradient_wrt_input_ldim) { - - // Indices - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - - // Copy batch normalization parameters to private memory - const auto& mean = global_mean[bidy]; - const auto& var = global_var[bidy]; - const auto& scale = global_scale[bidy]; - const auto& dmean = global_dmean[bidy]; - const auto& dvar = global_dvar[bidy]; - - // Compute useful constants - const auto& inv_stdev = cuda::rsqrt(var + epsilon); - const auto& dmean_term = dmean / TensorDataType(num_per_sum); - const auto& dvar_term = dvar * TensorDataType(2) / TensorDataType(num_per_sum - 1); - - // Apply batch normalization - if (gidx < channel_height) { - const auto& row = gidx + bidy * channel_height; - for (El::Int col = 0; col < local_width; ++col) { - const auto& x = global_input[row + col * input_ldim]; - const auto& dy = global_gradient_wrt_output[row + col * gradient_wrt_output_ldim]; - const auto& dxhat = dy * scale; - auto& dx = global_gradient_wrt_input[row + col * gradient_wrt_input_ldim]; - dx = dxhat * inv_stdev + dmean_term + dvar_term * (x - mean); + int gradient_wrt_input_ldim) { + + // Indices and dimensions + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = threadIdx.y + blockIdx.y * blockDim.y; + const auto& gidz = threadIdx.z + blockIdx.z * blockDim.z; + const auto& nthreadsx = blockDim.x * gridDim.x; + const auto& nthreadsy = blockDim.y * gridDim.y; + const auto& nthreadsz = blockDim.z * gridDim.z; + + for (auto k = gidz; k < num_channels; k += nthreadsz) { + const auto& mean = global_mean[k]; + const auto& var = global_var[k]; + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& scale = global_scale[k]; + const auto& dmean = global_dmean[k]; + const auto& dvar = global_dvar[k]; + const auto& dmean_term = dmean / TensorDataType(num_per_sum); + const auto& dvar_term = dvar * TensorDataType(2) / TensorDataType(num_per_sum - 1); + for (auto j = gidy; j < mini_batch_size; j += nthreadsy) { + for (auto i = gidx; i < channel_size; i += nthreadsx) { + const auto& x = global_input[i + k*channel_size + j*input_ldim]; + const auto& dy = global_gradient_wrt_output[i + k*channel_size + j*gradient_wrt_output_ldim]; + const auto& dxhat = dy * scale; + auto& dx = global_gradient_wrt_input[i + k*channel_size + j*gradient_wrt_input_ldim]; + dx = dxhat * inv_stdev + dmean_term + dvar_term * (x - mean); + } } } @@ -312,9 +360,9 @@ void batch_normalization_distconv_adapter::fp_com const bool is_training = l.m_model->get_execution_context().get_execution_mode() == 
execution_mode::training; auto& local_running_mean = - ValuesGetter::mutable_values(this->get_weights(2)).Matrix(); + ValuesGetter::mutable_values(l.get_weights(2)).Matrix(); auto& local_running_var = - ValuesGetter::mutable_values(this->get_weights(3)).Matrix(); + ValuesGetter::mutable_values(l.get_weights(3)).Matrix(); assert0(dc::tensor::View( m_scale, l.weights_values(0).LockedMatrix().LockedBuffer())); @@ -405,8 +453,8 @@ void batch_normalization_layer::fp_compute() { const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - auto&& stream = El::GPUManager::Stream(); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); + auto&& stream = hydrogen::cuda::GetDefaultStream(); // Matrices const auto& input = this->get_prev_activations(); @@ -436,18 +484,21 @@ void batch_normalization_layer::fp_compute() { El::Zero(local_mean); El::Zero(local_var); if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - channel_sums_kernel + grid_dims.y = El::Min(num_channels, 65535); + fp_sums_kernel <<>>( - channel_size, local_width, + local_width, + num_channels, + channel_size, local_input.LockedBuffer(), local_input.LDim(), - local_mean.Buffer(), local_var.Buffer()); + local_mean.Buffer(), + local_var.Buffer()); } - El::Int num_per_sum; + int num_per_sum; if (this->m_statistics_group_size == 0) { // Global statistics aggregation; allreduce on fused buffer. this->m_comm->allreduce(*this->m_mean_and_var, this->m_mean_and_var->RedundantComm(), @@ -475,9 +526,10 @@ void batch_normalization_layer::fp_compute() { if (num_per_sum <= 1) { El::Fill(local_var, TensorDataType(1.0)); } else if (num_channels > 0) { - const El::Int block_dim = 256; - const El::Int grid_dim = (num_channels + block_dim - 1) / block_dim; - compute_statistics_kernel<<>>( + constexpr size_t block_dim = 256; + const size_t grid_dim = El::Min((num_channels + block_dim - 1) / block_dim, + 65535); + fp_statistics_kernel<<>>( num_channels, num_per_sum, this->m_epsilon, this->m_decay, local_mean.Buffer(), local_var.Buffer(), local_running_mean.Buffer(), local_running_var.Buffer()); @@ -495,14 +547,15 @@ void batch_normalization_layer::fp_compute() { this->m_var_v->LockedMatrix() : this->weights_values(3).LockedMatrix()); if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - batch_normalization_kernel + grid_dims.y = El::Min(local_width, 65535); + grid_dims.z = El::Min(num_channels, 65535); + fp_output_kernel <<>>( - channel_size, local_width, + local_width, num_channels, channel_size, local_input.LockedBuffer(), local_input.LDim(), local_mean.LockedBuffer(), local_var.LockedBuffer(), this->m_epsilon, local_scale.LockedBuffer(), local_bias.LockedBuffer(), @@ -523,8 +576,8 @@ void batch_normalization_layer::bp_compute() { const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - auto&& stream = El::GPUManager::Stream(); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); + auto&& stream = 
hydrogen::cuda::GetDefaultStream(); // Matrices const auto& local_scale = this->weights_values(0).LockedMatrix(); @@ -557,14 +610,14 @@ void batch_normalization_layer::bp_compute() { El::Zero(local_mean_gradient); El::Zero(local_var_gradient); if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - backprop1_kernel + grid_dims.y = El::Min(num_channels, 65535); + bp_statistics_grad_kernel <<>>( - channel_size, local_width, + local_width, num_channels, channel_size, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), local_mean.LockedBuffer(), local_var.LockedBuffer(), this->m_epsilon, @@ -600,7 +653,7 @@ void batch_normalization_layer::bp_compute() { } // Compute error signal - El::Int num_per_sum; + int num_per_sum; if (this->m_statistics_group_size == 0) { // Global statistics aggregation. num_per_sum = channel_size * width; @@ -614,14 +667,15 @@ void batch_normalization_layer::bp_compute() { if (num_per_sum <= 1) { El::Zero(local_gradient_wrt_input); } else if (!local_input.IsEmpty()) { - const El::Int block_size = 256; + constexpr int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (channel_size + block_size - 1) / block_size; - grid_dims.y = num_channels; - backprop2_kernel + grid_dims.y = El::Min(local_width, 65535); + grid_dims.z = El::Min(num_channels, 65535); + bp_input_grad_kernel <<>>( - channel_size, local_width, num_per_sum, + local_width, num_channels, channel_size, num_per_sum, local_input.LockedBuffer(), local_input.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), local_mean.LockedBuffer(), local_var.LockedBuffer(), this->m_epsilon, diff --git a/src/layers/regularizers/entrywise_batch_normalization.cu b/src/layers/regularizers/entrywise_batch_normalization.cu index 3acd10ba1ab..ba0133a3148 100644 --- a/src/layers/regularizers/entrywise_batch_normalization.cu +++ b/src/layers/regularizers/entrywise_batch_normalization.cu @@ -127,7 +127,7 @@ void compute_batch_statistics(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; row_sums_kernel - <<>>( + <<>>( local_height, local_width, local_input.LockedBuffer(), @@ -155,7 +155,7 @@ void compute_batch_statistics(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; compute_statistics_kernel - <<>>( + <<>>( local_height, statistics_count, decay, @@ -219,7 +219,7 @@ void apply_batchnorm(DataType epsilon, grid_dims.x = (local_height + block_size_x - 1) / block_size_x; grid_dims.y = (local_width + block_size_y - 1) / block_size_y; batchnorm_kernel - <<>>( + <<>>( local_height, local_width, epsilon, @@ -419,7 +419,7 @@ void bp_training_impl(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; bp_training_stats_gradient_kernel - <<>>( + <<>>( local_height, local_width, epsilon, @@ -452,7 +452,7 @@ void bp_training_impl(lbann_comm& comm, grid_dims.x = (local_height + block_size_x - 1) / block_size_x; grid_dims.y = (local_width + block_size_y - 1) / block_size_y; bp_training_error_signal_kernel - <<>>( + <<>>( local_height, local_width, epsilon, @@ -530,7 +530,7 @@ void bp_inference_impl(DataType epsilon, grid_dims.x = (local_height + block_size_x - 1) / 
diff --git a/src/layers/regularizers/instance_norm.cu b/src/layers/regularizers/instance_norm.cu index f1b0a7f4775..b256d6c9b5d 100644 --- a/src/layers/regularizers/instance_norm.cu +++ b/src/layers/regularizers/instance_norm.cu @@ -208,7 +208,7 @@ void fp_impl(lbann_comm& comm, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; fp_sums_kernel - <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + <<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( local_mini_batch_size, num_channels, channel_size, local_input.LockedBuffer(), local_input.LDim(), local_sums.Buffer(), local_sums.LDim(), @@ -223,7 +223,7 @@ void fp_impl(lbann_comm& comm, grid_dims.x = (channel_size + block_size - 1) / block_size; grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; - fp_output_kernel<<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + fp_output_kernel<<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( local_mini_batch_size, num_channels, channel_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), @@ -454,7 +454,7 @@ void bp_impl(lbann_comm& comm, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_statistics_grad_kernel - <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + <<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( local_mini_batch_size, num_channels, channel_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(), @@ -473,7 +473,7 @@ void bp_impl(lbann_comm& comm, grid_dims.y = num_channels; grid_dims.z = local_mini_batch_size; bp_input_grad_kernel - <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + <<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( local_mini_batch_size, num_channels, channel_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(), diff --git a/src/layers/regularizers/layer_norm.cu b/src/layers/regularizers/layer_norm.cu index 11b55d7cce2..cd2ec15072f 100644 --- a/src/layers/regularizers/layer_norm.cu +++ b/src/layers/regularizers/layer_norm.cu @@ -202,7 +202,7 @@ void fp_impl(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; - fp_sums_kernel<<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + fp_sums_kernel<<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( local_num_samples, local_sample_size, local_input.LockedBuffer(), local_input.LDim(), local_means.Buffer(), local_means.LDim(), @@ -220,7 +220,7 @@ void fp_impl(lbann_comm& comm, dim3 block_dims, grid_dims; block_dims.x = block_size; grid_dims.x = (local_num_samples + block_size - 1) / block_size; - fp_statistics_kernel<<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + fp_statistics_kernel<<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( sample_size, local_num_samples, local_means.Buffer(), local_means.LDim(), local_vars.Buffer(), local_vars.LDim()); @@ -233,7 +233,7 @@ void fp_impl(lbann_comm& comm, block_dims.x = block_size; grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; - fp_output_kernel<<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + fp_output_kernel<<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( local_num_samples, local_sample_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), @@ -407,7 +407,7 @@ void bp_impl(lbann_comm& comm, grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; bp_statistics_grad_kernel - <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + <<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( local_num_samples, local_sample_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(), @@ -428,7 +428,7 @@ void bp_impl(lbann_comm& comm, grid_dims.x = (local_sample_size + block_size - 1) / block_size; grid_dims.y = local_num_samples; bp_input_grad_kernel - <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + <<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( sample_size, local_num_samples, local_sample_size, epsilon, local_input.LockedBuffer(), local_input.LDim(), local_output_grad.LockedBuffer(), local_output_grad.LDim(),
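Across these normalization layers the fp_sums kernels accumulate per-group sums and sums of squares, and the fp_statistics kernels then convert them into a mean and variance (batch normalization additionally folds in the running-statistics decay, and fills the variance with 1 when num_per_sum <= 1 so the degenerate single-sample case is skipped). A scalar sketch of that conversion, with hypothetical names and assuming the unbiased n/(n-1) form:

  #include <algorithm>

  // Sketch: finalize a mean/variance pair from accumulated sums.
  // n is the number of entries that contributed to the sums
  // ("num_per_sum" in the kernels above); caller guards n > 1.
  void finalize_statistics(float sum, float sqsum, int n,
                           float& mean, float& var) {
    mean = sum / n;
    const float sqmean = sqsum / n;
    // Unbiased estimate; clamp away small negative rounding error.
    var = std::max((sqmean - mean * mean) * n / (n - 1.0f), 0.0f);
  }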
diff --git a/src/layers/transform/concatenate.cu b/src/layers/transform/concatenate.cu index 0733cc9bb09..9b729f1606d 100644 --- a/src/layers/transform/concatenate.cu +++ b/src/layers/transform/concatenate.cu @@ -195,7 +195,7 @@ void fp_compute_impl( auto& output = l.get_activations(); auto& local_output = dynamic_cast<LocalMat&>(output.Matrix()); auto&& sync_info = El::SyncInfoFromMatrix(local_output); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each input tensor const size_t num_inputs = l.get_num_parents(); @@ -348,7 +348,7 @@ void bp_compute_impl( const auto& output_grad = l.get_prev_error_signals(); auto& local_output_grad = dynamic_cast<const LocalMat&>(output_grad.LockedMatrix()); auto&& sync_info = El::SyncInfoFromMatrix(local_output_grad); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each input gradient tensor const size_t num_inputs = l.get_num_parents(); diff --git a/src/layers/transform/crop.cu b/src/layers/transform/crop.cu index b84e6d364c0..6029c179e94 100644 --- a/src/layers/transform/crop.cu +++ b/src/layers/transform/crop.cu @@ -185,7 +185,7 @@ void crop_layer::fp_compute_3d() { block_dims.x = block_size; grid_dims.x = (output_size + block_size - 1) / block_size; grid_dims.y = local_width; - fp_compute_3d_kernel<<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + fp_compute_3d_kernel<<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( input_dims[2], input_dims[1], input_dims[0], output_dims[2], output_dims[1], output_dims[0], local_width, @@ -221,7 +221,7 @@ void crop_layer::bp_compute_3d() { block_dims.x = block_size; grid_dims.x = (output_size + block_size - 1) / block_size; grid_dims.y = local_width; - bp_compute_3d_kernel<<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>( + bp_compute_3d_kernel<<<grid_dims, block_dims, 0, hydrogen::cuda::GetDefaultStream()>>>( input_dims[2], input_dims[1], input_dims[0], output_dims[2], output_dims[1], output_dims[0], local_width, diff --git a/src/layers/transform/evaluation.cpp b/src/layers/transform/evaluation.cpp index 10ffb3f0bf9..6ff4e773f00 100644 --- a/src/layers/transform/evaluation.cpp +++ b/src/layers/transform/evaluation.cpp @@ -103,8 +103,8 @@ void fp_gpu(lbann_comm& comm, ones_d.SetMemoryMode(1); // Use CUB GPU memory pool #endif // HYDROGEN_HAVE_CUB sum_d.Resize(1, 1); - auto&& handle = El::GPUManager::cuBLASHandle(); - auto&& stream = El::GPUManager::Stream(); + auto&& handle = hydrogen::cublas::GetLibraryHandle(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); // Compute sum of local input matrix entries diff --git a/src/layers/transform/in_top_k.cu b/src/layers/transform/in_top_k.cu index 35d4e073bf6..1d02eca89db 100644 --- a/src/layers/transform/in_top_k.cu +++ b/src/layers/transform/in_top_k.cu @@ -188,8 +188,8 @@ void fp_gpu(lbann_comm& comm, const auto& col_comm_size = El::mpi::Size(col_comm); // GPU objects - auto&& stream = El::GPUManager::Stream(); - auto&& event = El::GPUManager::Event(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + auto&& event = hydrogen::cuda::GetDefaultEvent(); cuda::thrust::allocator<> alloc(stream); // Find top-k entries in each column of local prediction matrix diff --git a/src/layers/transform/slice.cu b/src/layers/transform/slice.cu index f1e478632fa..cdba394b5d8 100644 --- a/src/layers/transform/slice.cu +++ b/src/layers/transform/slice.cu @@ -192,7 +192,7 @@ void fp_compute_impl( const auto& input = l.get_prev_activations(); const auto& local_input = dynamic_cast<const LocalMat&>(input.LockedMatrix());
auto&& sync_info = El::SyncInfoFromMatrix(local_input); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each output tensor const size_t num_outputs = l.get_num_children(); @@ -341,7 +341,7 @@ void bp_compute_impl( auto& input_grad = l.get_error_signals(); auto& local_input_grad = dynamic_cast(input_grad.Matrix()); auto&& sync_info = El::SyncInfoFromMatrix(local_input_grad); - auto&& stream = sync_info.stream_; + auto&& stream = sync_info.Stream(); // Get dimensions and strides for each output gradient tensor const size_t num_outputs = l.get_num_children(); diff --git a/src/layers/transform/sort.cu b/src/layers/transform/sort.cu index 4707459a456..987d3b24dce 100644 --- a/src/layers/transform/sort.cu +++ b/src/layers/transform/sort.cu @@ -48,7 +48,7 @@ void sort_layer::fp_compute() { const auto& local_width = local_input.Width(); // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); cuda::thrust::allocator<> alloc(stream); // Sort each matrix column @@ -82,7 +82,7 @@ void sort_layer::bp_compute() { const auto& local_width = local_gradient_wrt_input.Width(); // GPU objects - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); cuda::thrust::allocator<> alloc(stream); // Scatter gradients based on sorted indices diff --git a/src/layers/transform/split.cu b/src/layers/transform/split.cu index e964b2ee839..615a4dbe058 100644 --- a/src/layers/transform/split.cu +++ b/src/layers/transform/split.cu @@ -71,29 +71,29 @@ void split_distconv_adapter::bp_compute() { auto &error_signals = this->get_error_signals(0); switch (this->layer().get_num_children()) { case 0: - error_signals.zero(El::GPUManager::Stream()); + error_signals.zero(hydrogen::cuda::GetDefaultStream()); break; case 1: dc::tensor::Copy(error_signals, this->get_prev_error_signals(0), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; case 2: dc::tensor::Transform(error_signals, this->get_prev_error_signals(0), this->get_prev_error_signals(1), sum_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; default: dc::tensor::Copy(error_signals, this->get_prev_error_signals(1), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); for (int i = 1; i < this->layer().get_num_children(); ++i) { const auto &prev_error = this->get_prev_error_signals(i); dc::tensor::Transform(error_signals, prev_error, accumulate_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } return; diff --git a/src/layers/transform/sum.cu b/src/layers/transform/sum.cu index 4aaa819dd76..de5158a79a7 100644 --- a/src/layers/transform/sum.cu +++ b/src/layers/transform/sum.cu @@ -67,11 +67,11 @@ void sum_distconv_adapter::fp_compute() { auto &activations = this->get_activations(); switch (this->layer().get_num_parents()) { case 0: - activations.zero(El::GPUManager::Stream()); + activations.zero(hydrogen::cuda::GetDefaultStream()); break; case 1: dc::tensor::Copy(activations, this->get_prev_activations(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; case 2: // Optimization for layers with 2 parents (e.g., @@ -82,7 +82,7 @@ void sum_distconv_adapter::fp_compute() { this->get_prev_activations(0), this->get_prev_activations(1), sum_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); break; default: for (int i = 0; i < this->layer().get_num_parents(); ++i) { @@ -90,11 +90,11 @@ void 
sum_distconv_adapter::fp_compute() { prev_activations.set_outermost_dimension(activations.get_shape()[-1]); if (i == 0) { dc::tensor::Copy(activations, prev_activations, - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } else { distconv::tensor::Transform(activations, prev_activations, accumulate_op(), - El::GPUManager::Stream()); + hydrogen::cuda::GetDefaultStream()); } } } diff --git a/src/layers/transform/tessellate.cu b/src/layers/transform/tessellate.cu index 771a225c3d9..c74d02dc4a5 100644 --- a/src/layers/transform/tessellate.cu +++ b/src/layers/transform/tessellate.cu @@ -130,7 +130,7 @@ void tessellate_layer const auto& local_width = local_output.Width(); const auto& block_size = 256; const auto& grid_size = (local_height * local_width + block_size - 1) / block_size; - fp_gpu_3d_kernel<<>>( + fp_gpu_3d_kernel<<>>( input_dims[0], input_dims[1], input_dims[2], output_dims[0], output_dims[1], output_dims[2], local_height, local_width, @@ -153,7 +153,7 @@ void tessellate_layer const auto& local_width = local_gradient_wrt_output.Width(); const auto& block_size = 256; const auto& grid_size = (local_height * local_width + block_size - 1) / block_size; - bp_gpu_3d_kernel<<>>( + bp_gpu_3d_kernel<<>>( input_dims[0], input_dims[1], input_dims[2], output_dims[0], output_dims[1], output_dims[2], local_height, local_width, diff --git a/src/models/model.cpp b/src/models/model.cpp index c3ea344fe6e..ce0ade8087e 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -594,6 +594,7 @@ void model::setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata) { } #ifdef LBANN_HAS_DISTCONV + m_max_mini_batch_size_distconv = max_mini_batch_size; setup_distconv(); #endif @@ -1330,7 +1331,7 @@ bool model::load_from_checkpoint_shared(persist& p) { // } p.set_restart_dir(trainer_dir); #ifdef LBANN_HAS_GPU - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); #endif // LBANN_HAS_GPU return true; } diff --git a/src/objective_functions/weight_regularization/l2.cpp b/src/objective_functions/weight_regularization/l2.cpp index 36c94602e34..c73981c3627 100644 --- a/src/objective_functions/weight_regularization/l2.cpp +++ b/src/objective_functions/weight_regularization/l2.cpp @@ -119,7 +119,7 @@ void l2_weight_regularization::start_evaluation() { #ifdef LBANN_HAS_GPU // Compute contributions from GPU weights if (m_contributions.count(El::Device::GPU) > 0) { - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); DMatType contribution; #ifdef HYDROGEN_HAVE_CUB contribution.SetMemoryMode(1); // CUB GPU memory pool diff --git a/src/objective_functions/weight_regularization/l2.cu b/src/objective_functions/weight_regularization/l2.cu index 7a823a9d8b6..dabe3c1c730 100644 --- a/src/objective_functions/weight_regularization/l2.cu +++ b/src/objective_functions/weight_regularization/l2.cu @@ -81,8 +81,8 @@ void l2_weight_regularization::accumulate_contribution(const El const auto& size = vals.Height() * vals.Width(); const El::Int block_size = 256; const auto& grid_size = (size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + auto&& stream = hydrogen::cuda::GetDefaultStream(); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); accumulate_contribution_kernel <<>>( vals.Height(), vals.Width(), diff --git a/src/optimizers/adagrad.cu b/src/optimizers/adagrad.cu index 9325c5efdb3..22c5b4c6dec 100644 --- a/src/optimizers/adagrad.cu +++ 
b/src/optimizers/adagrad.cu @@ -65,7 +65,7 @@ void adagrad::step_compute_gpu(AbsDistMatrixType& values, if (local_size > 0) { constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); adagrad_kernel<<<grid_size, block_size, 0, stream>>>( local_height, local_width, this->get_learning_rate(), m_eps, diff --git a/src/optimizers/adam.cpp b/src/optimizers/adam.cpp index 551920b4f94..8bb89b5a31e 100644 --- a/src/optimizers/adam.cpp +++ b/src/optimizers/adam.cpp @@ -158,6 +158,9 @@ void adam::step_compute_cpu(AbsDistMatrixType& values, for (size_t i = 0; i < local_size; ++i) { auto& x = values_buffer[i]; const auto& g = gradient_buffer[i] + m_eps; // Avoid denormalized floats + if (std::isinf(g) || std::isnan(g)) { + continue; + } auto& m1 = moment1_buffer[i]; auto& m2 = moment2_buffer[i]; m1 = m_beta1 * m1 + (one - m_beta1) * g; @@ -177,6 +180,9 @@ void adam::step_compute_cpu(AbsDistMatrixType& values, for (size_t row = 0; row < local_height; ++row) { auto& x = values_buffer[row+col*values_ldim]; const auto& g = gradient_buffer[row+col*gradient_ldim] + m_eps; // Avoid denormalized floats + if (std::isinf(g) || std::isnan(g)) { + continue; + } auto& m1 = moment1_buffer[row+col*moment1_ldim]; auto& m2 = moment2_buffer[row+col*moment2_ldim]; m1 = m_beta1 * m1 + (one - m_beta1) * g; diff --git a/src/optimizers/adam.cu b/src/optimizers/adam.cu index 6901a990ead..f4e67d3be0a 100644 --- a/src/optimizers/adam.cu +++ b/src/optimizers/adam.cu @@ -50,6 +50,9 @@ __global__ void adam_noncontiguous_kernel(size_t height, const auto& row = gid % height; const auto& col = gid / height; const auto& g = gradient[row + col * gradient_ldim] + eps; + if (cuda::isinf(g) || cuda::isnan(g)) { + return; + } auto& m1 = moment1[row + col * moment1_ldim]; auto& m2 = moment2[row + col * moment2_ldim]; auto& x = values[row + col * values_ldim]; @@ -72,6 +75,9 @@ __global__ void adam_contiguous_kernel(size_t size, const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; if (gid < size) { const auto& g = gradient[gid] + eps; + if (cuda::isinf(g) || cuda::isnan(g)) { + return; + } auto& m1 = moment1[gid]; auto& m2 = moment2[gid]; auto& x = values[gid]; @@ -96,7 +102,7 @@ void adam::step_compute_gpu(AbsDistMatrixType& values, // Launch CUDA kernel constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); if (values.Contiguous() && gradient.Contiguous() && m_moment1->Contiguous() && m_moment2->Contiguous()) { adam_contiguous_kernel<<<grid_size, block_size, 0, stream>>>( diff --git a/src/optimizers/rmsprop.cu b/src/optimizers/rmsprop.cu index e3820a4d22f..4f67dec1ff4 100644 --- a/src/optimizers/rmsprop.cu +++ b/src/optimizers/rmsprop.cu @@ -67,7 +67,7 @@ void rmsprop::step_compute_gpu(AbsDistMatrixType& values, if (local_size > 0) { constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); rmsprop_kernel<<<grid_size, block_size, 0, stream>>>( local_height, local_width, this->get_learning_rate(), m_decay_rate, m_eps, diff --git a/src/optimizers/sgd.cu b/src/optimizers/sgd.cu index b33e54ee5d2..a65c51370cb 100644 --- a/src/optimizers/sgd.cu +++ b/src/optimizers/sgd.cu @@ -109,7 +109,7 @@ void sgd::momentum_step_gpu(AbsDistMatrixType& values, // Launch CUDA kernels for momentum SGD or NAG constexpr size_t block_size = 256; const size_t grid_size = (local_size + block_size - 1) / block_size; - auto&& stream = El::GPUManager::Stream(); + auto&& stream = hydrogen::cuda::GetDefaultStream(); if (m_nesterov) { nesterov_kernel<<<grid_size, block_size, 0, stream>>>( local_height, local_width,
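The adam.cpp and adam.cu hunks above implement the "skip bad gradients in Adam" item from the v0.101 release notes: when a gradient entry is Inf or NaN, that entry's update is skipped entirely so the moment estimates are not poisoned. The same rule as a standalone scalar sketch (names and the bias-correction factor are illustrative):

  #include <cmath>

  // Sketch of one Adam entry update with the bad-gradient guard.
  void adam_entry_update(float& x, float& m1, float& m2, float grad,
                         float eps, float beta1, float beta2,
                         float correction, float learning_rate) {
    const float g = grad + eps;  // avoid denormalized floats, as above
    if (std::isinf(g) || std::isnan(g)) {
      return;  // skip: leave moments and the weight untouched
    }
    m1 = beta1 * m1 + (1.0f - beta1) * g;
    m2 = beta2 * m2 + (1.0f - beta2) * g * g;
    x -= learning_rate * correction * m1 / (std::sqrt(m2) + eps);
  }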
diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 6d9d52899b4..d7c1ad3085d 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -48,6 +48,7 @@ #include "lbann/layers/learning/embedding.hpp" #include "lbann/layers/learning/entrywise_scale_bias.hpp" #include "lbann/layers/learning/fully_connected.hpp" +#include "lbann/layers/learning/gru.hpp" #include "lbann/layers/learning/learning.hpp" #include "lbann/layers/loss/categorical_accuracy.hpp" #include "lbann/layers/loss/cross_entropy.hpp" @@ -167,6 +168,7 @@ class factory_manager LBANN_REGISTER_BUILDER(Embedding, embedding); LBANN_REGISTER_BUILDER(EntrywiseScaleBias, entrywise_scale_bias); LBANN_REGISTER_BUILDER(FullyConnected, fully_connected); + LBANN_REGISTER_BUILDER(GRU, gru); // Math layers LBANN_REGISTER_DEFAULT_BUILDER(Abs, abs); @@ -250,7 +252,6 @@ class factory_manager LBANN_REGISTER_DEFAULT_BUILDER(BooleanFalseNegative, boolean_false_negative); LBANN_REGISTER_DEFAULT_BUILDER(BooleanFalsePositive, boolean_false_positive); LBANN_REGISTER_DEFAULT_BUILDER(CategoricalAccuracy, categorical_accuracy); - LBANN_REGISTER_DEFAULT_BUILDER(CrossEntropy, cross_entropy); LBANN_REGISTER_DEFAULT_BUILDER(L1Norm, l1_norm); LBANN_REGISTER_DEFAULT_BUILDER(L2Norm2, l2_norm2); LBANN_REGISTER_DEFAULT_BUILDER(MeanAbsoluteError, mean_absolute_error); @@ -328,6 +329,7 @@ std::unique_ptr<Layer> construct_layer_legacy( if (mode_str.empty() || mode_str == "classification") { target_mode = data_reader_target_mode::CLASSIFICATION; } if (mode_str == "regression") { target_mode = data_reader_target_mode::REGRESSION; } if (mode_str == "reconstruction") { target_mode = data_reader_target_mode::RECONSTRUCTION; } + if (mode_str == "label_reconstruction") { target_mode = data_reader_target_mode::LABEL_RECONSTRUCTION; } if (mode_str == "na" || mode_str == "NA" || mode_str == "N/A") { target_mode = data_reader_target_mode::NA; } if (Layout != data_layout::DATA_PARALLEL) { LBANN_ERROR("input layer is only supported with " @@ -655,6 +657,10 @@ std::unique_ptr<Layer> construct_layer_legacy( } // Loss layers + if (proto_layer.has_cross_entropy()) { + const auto& params = proto_layer.cross_entropy(); + return lbann::make_unique<cross_entropy_layer<TensorDataType, Layout, Device>>(comm, params.use_labels()); + } if (proto_layer.has_top_k_categorical_accuracy()) { const auto& params = proto_layer.top_k_categorical_accuracy(); return lbann::make_unique<top_k_categorical_accuracy_layer<TensorDataType, Layout, Device>>(comm, params.k()); diff --git a/src/proto/layers.proto b/src/proto/layers.proto index be5f2883155..8ec39f1b446 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -103,6 +103,7 @@ message Layer { ChannelwiseScaleBias channelwise_scale_bias = 329; EntrywiseScaleBias entrywise_scale_bias = 330; ChannelwiseFullyConnected channelwise_fully_connected = 331; + GRU gru = 333; // Loss layers CrossEntropy cross_entropy = 60; @@ -315,7 +316,9 @@ message Layer { /////////////////////// // Loss layers // /////////////////////// - message CrossEntropy {} + message CrossEntropy { + bool use_labels = 1; //default: false + } message MeanSquaredError {} message MeanAbsoluteError {} message CategoricalAccuracy {} @@ -396,7 +399,7 @@ message Layer { ////////////////// message Input { string io_buffer = 2; // Options: "partitioned" (default) - string target_mode = 
3; // Options: "classification" (default), "regression", "reconstruction", "N/A" + string target_mode = 3; // Options: "classification" (default), "regression", "reconstruction", "label_reconstruction", "N/A" } ////////////////////// @@ -662,6 +665,28 @@ message Layer { google.protobuf.BoolValue transpose = 3; } + /** @brief Gated recurrent unit + * + * Expects two inputs: a 2D input sequence ( + * @f$ \text{sequence\_length}\times\text{input\_size} @f$ ) + * and a 1D initial hidden state ( @f$ \text{hidden\_size} @f$ ). + * + * Uses four weights: "ih\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{input\_size} @f$ ), + * "hh\_matrix" ( + * @f$ 3 \text{hidden\_size}\times\text{hidden\_size} @f$ ), + * "ih_bias" ( @f$ 3 \text{hidden\_size} @f$ ), + * "hh_bias" ( @f$ 3 \text{hidden\_size} @f$ ). + * + * @todo Support CPU + * @todo Support bidirectional RNNs + * @todo Support stacked RNNs + */ + message GRU { + /// Size of each hidden state and output vector + uint64 hidden_size = 1; + } + ////////////////// // Image layers // ////////////////// diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 6955e4e0a84..ed3196e4c2e 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -102,6 +102,8 @@ void init_data_readers( set_transform_pipeline = false; } else if ((name == "imagenet")) { init_image_data_reader(readme, pb_metadata, master, reader); + reader->set_data_sample_list(readme.sample_list()); + reader->keep_sample_order(readme.sample_list_keep_order()); set_transform_pipeline = false; } else if (name == "jag_conduit") { init_image_data_reader(readme, pb_metadata, master, reader); @@ -110,9 +112,10 @@ void init_data_readers( const lbann_data::Model& pb_model = p.model(); const lbann_data::Trainer& pb_trainer = p.trainer(); reader->set_mini_batch_size(static_cast(pb_trainer.mini_batch_size())); - reader->set_data_index_list(readme.index_list()); - reader_jag_conduit->set_list_per_trainer(readme.index_list_per_trainer()); - reader_jag_conduit->set_list_per_model(readme.index_list_per_model()); + reader->set_data_sample_list(readme.sample_list()); + reader_jag_conduit->set_list_per_trainer(readme.sample_list_per_trainer()); + reader_jag_conduit->set_list_per_model(readme.sample_list_per_model()); + reader_jag_conduit->keep_sample_order(readme.sample_list_keep_order()); /// Allow the prototext to control if the data readers is /// shareable for each phase training, validation, or testing @@ -183,6 +186,30 @@ void init_data_readers( reader_numpy_npz->set_has_responses(!readme.disable_responses()); reader_numpy_npz->set_scaling_factor_int16(readme.scaling_factor_int16()); reader = reader_numpy_npz; +#ifdef LBANN_HAS_DISTCONV + } else if (name == "cosmoflow_hdf5" || name == "hdf5") { + if(name == "cosmoflow_hdf5") { + LBANN_WARNING("The \"cosmoflow_hdf5\" data reader is deprecated. 
Use \"hdf5\" instead."); + } + const auto key_data = readme.hdf5_key_data(); + const auto key_labels = readme.hdf5_key_labels(); + const auto key_responses = readme.hdf5_key_responses(); + const auto hyperslab_labels = readme.hdf5_hyperslab_labels(); + auto* reader_hdf5 = new hdf5_reader(shuffle, key_data, + key_labels, + key_responses, + hyperslab_labels); + reader_hdf5->set_has_labels(!readme.disable_labels()); + reader_hdf5->set_has_responses(!readme.disable_responses()); + reader_hdf5->set_num_responses(readme.num_responses()); + auto filedir = readme.data_filedir(); + if(!endsWith(filedir, "/")) { + filedir = filedir + "/"; + } + const auto paths = glob(filedir +readme.data_file_pattern()); + reader_hdf5->set_hdf5_paths(paths); + reader = reader_hdf5; +#endif // LBANN_HAS_DISTCONV } else if (name == "pilot2_molecular_reader") { pilot2_molecular_reader* reader_pilot2_molecular = new pilot2_molecular_reader(readme.num_neighbors(), readme.max_neighborhood(), shuffle); reader = reader_pilot2_molecular; @@ -645,18 +672,18 @@ void set_data_readers_filenames( } } -void set_data_readers_index_list( +void set_data_readers_sample_list( const std::string& which, lbann_data::LbannPB& p) { options *opts = options::get(); lbann_data::DataReader *readers = p.mutable_data_reader(); int size = readers->reader_size(); - const std::string key_role = "index_list_" + which; + const std::string key_role = "sample_list_" + which; for (int j=0; jmutable_reader(j); if (r->role() == which) { - r->set_index_list(opts->get_string(key_role)); + r->set_sample_list(opts->get_string(key_role)); } } } @@ -679,7 +706,7 @@ void set_data_readers_percent(lbann_data::LbannPB& p) } } -void customize_data_readers_index_list(const lbann_comm& comm, lbann_data::LbannPB& p) +void customize_data_readers_sample_list(const lbann_comm& comm, lbann_data::LbannPB& p) { lbann_data::DataReader *readers = p.mutable_data_reader(); const lbann_data::Model& pb_model = p.model(); @@ -687,17 +714,26 @@ void customize_data_readers_index_list(const lbann_comm& comm, lbann_data::Lbann for (int j=0; jmutable_reader(j); std::ostringstream s; - std::string basename = get_basename_without_ext(r->index_list()); - std::string ext = get_ext_name(r->index_list()); - if(r->index_list_per_model()) { + std::string basename = get_basename_without_ext(r->sample_list()); + std::string ext = get_ext_name(r->sample_list()); + std::string dir = lbann::file::extract_parent_directory(r->sample_list()); + if ((r->sample_list()).empty()) { + continue; + } + if (dir.empty()) { + dir = "."; + } + + s << dir << '/'; + if(r->sample_list_per_model()) { s << pb_model.name() << "_"; } - if(r->index_list_per_trainer()) { + if(r->sample_list_per_trainer()) { s << "t" << comm.get_trainer_rank() << "_"; } s << basename; s << "." 
<< ext; - r->set_index_list(s.str()); + r->set_sample_list(s.str()); } } @@ -726,17 +762,26 @@ void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p) or opts->has_string("label_filename_train")) { set_data_readers_filenames("train", p); } + if (opts->has_string("data_filedir") + or opts->has_string("data_filedir_validate") + or opts->has_string("data_filename_validate") + or opts->has_string("label_filename_validate")) { + set_data_readers_filenames("validate", p); + } if (opts->has_string("data_filedir") or opts->has_string("data_filedir_test") or opts->has_string("data_filename_test") or opts->has_string("label_filename_test")) { set_data_readers_filenames("test", p); } - if (opts->has_string("index_list_train")) { - set_data_readers_index_list("train", p); + if (opts->has_string("sample_list_train")) { + set_data_readers_sample_list("train", p); + } + if (opts->has_string("sample_list_validate")) { + set_data_readers_sample_list("validate", p); } - if (opts->has_string("index_list_test")) { - set_data_readers_index_list("test", p); + if (opts->has_string("sample_list_test")) { + set_data_readers_sample_list("test", p); } if (opts->has_string("data_reader_percent")) { set_data_readers_percent(p); @@ -917,7 +962,7 @@ void print_help(std::ostream& os) " sets the file directory for train and test data\n" " --data_filedir_train= --data_filedir_test=\n" " --data_filename_train= --data_filename_test=\n" - " --index_list_train= --index_list_test=\n" + " --sample_list_train= --sample_list_test=\n" " --label_filename_train= --label_filename_test=\n" " --data_reader_percent=\n" " --share_testing_data_readers=\n" diff --git a/src/proto/reader.proto b/src/proto/reader.proto index e06050aacff..ce01e29dd84 100644 --- a/src/proto/reader.proto +++ b/src/proto/reader.proto @@ -44,7 +44,7 @@ message Reader { string data_local_filedir = 50; //to support data_store string data_filename = 6; string label_filename = 7; - string index_list = 8; + string sample_list = 8; double validation_percent = 9; int64 absolute_sample_count = 11; int64 first_n = 200; @@ -84,14 +84,25 @@ message Reader { // 2 - there's a set of overlap indices that are common to all models //------------- end of only for partitioned data sets ------------------ - //------------- start of only for index lists ------------------ - bool index_list_per_trainer = 400; - bool index_list_per_model = 401; - //------------- end of only for index lists ------------------ + //------------- start of only for sample lists ------------------ + bool sample_list_per_trainer = 400; + bool sample_list_per_model = 401; + // For testing and validation, keep the loaded sample order same as that in the file + bool sample_list_keep_order = 402; + //------------- end of only for sample lists ------------------ PythonDataReader python = 501; repeated Transform transforms = 600; // Ordered list of transforms to apply. 
+ + //------------- start of only for HDF5 data reader ------------------ + string hdf5_key_data = 700; + string hdf5_key_labels = 701; + string hdf5_key_responses = 702; + bool hdf5_hyperslab_labels = 703; + int32 num_responses = 704; + //------------- end of only for HDF5 data reader ------------------ + } message PythonDataReader { diff --git a/src/utils/cuda.cu b/src/utils/cuda.cu index 2838f6e1f58..dfd7f09e67e 100644 --- a/src/utils/cuda.cu +++ b/src/utils/cuda.cu @@ -31,9 +31,9 @@ namespace lbann { namespace cuda { -//////////////////////////////////////////////////////////// -// CUDA event wrapper -//////////////////////////////////////////////////////////// +// ------------------------------------------------------------- +// Utilities for CUDA events +// ------------------------------------------------------------- event_wrapper::event_wrapper() : m_event(nullptr), m_stream(0) { CHECK_CUDA(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); @@ -77,6 +77,149 @@ void event_wrapper::synchronize() { cudaEvent_t& event_wrapper::get_event() { return m_event; } +// ------------------------------------------------------------- +// Helper functions for tensor operations +// ------------------------------------------------------------- + +namespace { + +using int4 = cuda::array<int, 4>; + +/** + * Block dimensions: bdimx x bdimy x bdimz + * + * Grid dimensions: (dim[3] / bdimx) x (dim[2] / bdimy) x (dim[1] / bdimz) + */ +template <typename TensorDataType> +__global__ void copy_4d_kernel( + int4 dims, + const TensorDataType* __restrict__ input, + int4 input_strides, + TensorDataType* __restrict__ output, + int4 output_strides) { + + // Indices + const auto& gidx = threadIdx.x + blockIdx.x * blockDim.x; + const auto& gidy = threadIdx.y + blockIdx.y * blockDim.y; + const auto& gidz = threadIdx.z + blockIdx.z * blockDim.z; + const auto& nthreadsx = gridDim.x * blockDim.x; + const auto& nthreadsy = gridDim.y * blockDim.y; + const auto& nthreadsz = gridDim.z * blockDim.z; + + for (int i0=0; i0<dims[0]; ++i0) { + for (int i1=gidz; i1<dims[1]; i1+=nthreadsz) { + for (int i2=gidy; i2<dims[2]; i2+=nthreadsy) { + for (int i3=gidx; i3<dims[3]; i3+=nthreadsx) { + const auto& x = input[i0*input_strides[0] + i1*input_strides[1] + i2*input_strides[2] + i3*input_strides[3]]; + auto& y = output[i0*output_strides[0] + i1*output_strides[1] + i2*output_strides[2] + i3*output_strides[3]]; + y = x; + } + } + } + } + +} + +} // namespace + +template <typename TensorDataType> +void copy_tensor( + cudaStream_t stream, + const std::vector<int>& dims, + const TensorDataType* input, + const std::vector<int>& input_strides, + TensorDataType* output, + const std::vector<int>& output_strides) { + + // Check inputs + if (dims.empty() || dims.size() > 4) { + LBANN_ERROR("invalid number of tensor dimensions (",dims.size(),")"); + } + if (dims.size() != input_strides.size()) { + LBANN_ERROR( + "number of input strides (",input_strides.size(),") ", + "does not match number of tensor dimensions (",dims.size(),")"); + } + if (dims.size() != output_strides.size()) { + LBANN_ERROR( + "number of output strides (",output_strides.size(),") ", + "does not match number of tensor dimensions (",dims.size(),")"); + } + + // Pad tensor dimensions to 4D + std::vector<int> + rdims(dims.rbegin(), dims.rend()), + input_rstrides(input_strides.rbegin(), input_strides.rend()), + output_rstrides(output_strides.rbegin(), output_strides.rend()); + rdims.resize(4, 1); + input_rstrides.resize(4, input_rstrides.back()); + output_rstrides.resize(4, output_rstrides.back()); + + // Launch CUDA kernel + const auto size = std::accumulate( + dims.begin(), dims.end(), 1, std::multiplies<int>()); + if (size > 0) { + constexpr size_t block_size = 64; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + block_dims.y = 1; + block_dims.z = 1; + grid_dims.x = (rdims[0] + block_dims.x - 1) / block_dims.x; + grid_dims.y = (rdims[1] + block_dims.y - 1) / block_dims.y; + grid_dims.z = (rdims[2] + block_dims.z - 1) / block_dims.z; + grid_dims.y = El::Min(grid_dims.y, 65535); + grid_dims.z = El::Min(grid_dims.z, 65535); + copy_4d_kernel<<<grid_dims, block_dims, 0, stream>>>( + {rdims[3], rdims[2], rdims[1], rdims[0]}, + input, + {input_rstrides[3], input_rstrides[2], + input_rstrides[1], input_rstrides[0]}, + output, + {output_rstrides[3], output_rstrides[2], + output_rstrides[1], output_rstrides[0]}); + } + +}
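copy_tensor pads any tensor of up to four dimensions out to 4-D (dimensions ordered slowest- to fastest-varying) and launches copy_4d_kernel over it. A hypothetical call copying a column-major height x width matrix whose source and destination leading dimensions differ:

  // Hypothetical usage sketch: both matrices are h x w, column-major,
  // with possibly different strides (leading dimensions) between columns.
  void copy_strided_matrix(cudaStream_t stream, int h, int w,
                           const float* src, int src_ldim,
                           float* dst, int dst_ldim) {
    lbann::cuda::copy_tensor(
      stream,
      {w, h},               // dims: columns (slow), then rows (fast)
      src, {src_ldim, 1},   // stride between columns, between rows
      dst, {dst_ldim, 1});
  }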
 +#if defined(LBANN_HAS_HALF) && defined(LBANN_HAS_GPU_HALF) +template <> +void copy_tensor<cpu_fp16>( + cudaStream_t stream, + const std::vector<int>& dims, + const cpu_fp16* input, + const std::vector<int>& input_strides, + cpu_fp16* output, + const std::vector<int>& output_strides) { + copy_tensor( + stream, + dims, + reinterpret_cast<const fp16*>(input), + input_strides, + reinterpret_cast<fp16*>(output), + output_strides); +} +#endif // defined(LBANN_HAS_HALF) && defined(LBANN_HAS_GPU_HALF) + +// Explicit template instantiation +#define PROTO(T) \ + template void copy_tensor( \ + cudaStream_t stream, \ + const std::vector<int>& dims, \ + const T* input, \ + const std::vector<int>& input_strides, \ + T* output, \ + const std::vector<int>& output_strides); +#define LBANN_INSTANTIATE_GPU_HALF +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO + } // namespace cuda } // namespace lbann diff --git a/src/utils/cudnn.cpp b/src/utils/cudnn.cpp index 260fa2f002f..3f28a5e8c45 100644 --- a/src/utils/cudnn.cpp +++ b/src/utils/cudnn.cpp @@ -48,10 +48,10 @@ namespace { struct handle_wrapper { cudnnHandle_t handle; handle_wrapper() : handle(nullptr) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); if (handle == nullptr) { CHECK_CUDNN(cudnnCreate(&handle)); } if (handle == nullptr) { LBANN_ERROR("failed to create cuDNN handle"); } - CHECK_CUDNN(cudnnSetStream(handle, El::GPUManager::Stream())); + CHECK_CUDNN(cudnnSetStream(handle, hydrogen::cuda::GetDefaultStream())); } handle_wrapper(const handle_wrapper&) = delete; handle_wrapper& operator=(const handle_wrapper&) = delete; @@ -75,9 +75,9 @@ void destroy() { cudnnHandle_t& get_handle() { if (!handle_instance) { initialize(); } - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + CHECK_CUDA(cudaSetDevice(hydrogen::gpu::DefaultDevice())); CHECK_CUDNN(cudnnSetStream(handle_instance->handle, - El::GPUManager::Stream())); + hydrogen::cuda::GetDefaultStream())); return handle_instance->handle; } @@ -241,6 +241,446 @@ void copy_activation_desc(const cudnnActivationDescriptor_t& src, } +//////////////////////////////////////////////////////////// +// Wrapper classes for cuDNN types +//////////////////////////////////////////////////////////// + +// ----------------------------- +// TensorDescriptor +// ----------------------------- + +TensorDescriptor::TensorDescriptor(cudnnTensorDescriptor_t desc) + : desc_{desc} +{} + +TensorDescriptor::~TensorDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyTensorDescriptor(desc_); + } +} + +TensorDescriptor::TensorDescriptor(const TensorDescriptor& other) { + if (other.desc_) { + cudnnDataType_t data_type; + int num_dims; + CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + other.desc_, + 0, // nbDimsRequested + &data_type, + &num_dims, + nullptr, // dimA + nullptr)); // strideA + std::vector<int> dims(num_dims), strides(num_dims); + CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + other.desc_, + num_dims, + &data_type, + &num_dims, + dims.data(), + strides.data())); + set(data_type, dims, strides); + } +} + +TensorDescriptor::TensorDescriptor(TensorDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +}
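These wrapper classes give cuDNN descriptor handles value semantics: destruction destroys the handle, copying deep-copies the descriptor configuration, and moving transfers ownership, so descriptors can be stored in containers and returned from functions safely. A usage sketch against the TensorDescriptor interface above, assuming the lbann::cudnn namespace (cuDNN requires at least 3 dimensions, and passing empty strides selects the packed, contiguous layout):

  // Sketch: build a packed 3-D tensor descriptor with RAII semantics.
  lbann::cudnn::TensorDescriptor make_acts_desc(int n, int c, int l) {
    lbann::cudnn::TensorDescriptor desc;
    desc.set(CUDNN_DATA_FLOAT, {n, c, l}, {});  // strides inferred as packed
    return desc;  // moves out; no double-destroy on scope exit
  }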
 +TensorDescriptor& TensorDescriptor::operator=(TensorDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(TensorDescriptor& first, TensorDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void TensorDescriptor::reset(cudnnTensorDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyTensorDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnTensorDescriptor_t TensorDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnTensorDescriptor_t TensorDescriptor::get() const noexcept { + return desc_; +} + +TensorDescriptor::operator cudnnTensorDescriptor_t() const noexcept { + return get(); +} + +void TensorDescriptor::create() { + if (!desc_) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&desc_)); + } +} + +void TensorDescriptor::set( + cudnnDataType_t data_type, + const std::vector<int>& dims, + std::vector<int> strides) { + + // Check that arguments are valid + if (dims.empty()) { + LBANN_ERROR("attempted to set cuDNN tensor descriptor with no dimensions"); + } + if (dims.size() < 3) { + // As of cuDNN 7.65, cuDNN does not support tensors with <3 dims + LBANN_ERROR( + "attempted to set cuDNN tensor descriptor with fewer than 3 dimensions"); + } + if (!strides.empty() && dims.size() != strides.size()) { + LBANN_ERROR( + "attempted to set cuDNN tensor descriptor ", + "with mismatched dimensions (",dims.size(),") ", + "and strides (",strides.size(),")"); + } + + // Assume data is contiguous if no strides are provided + if (strides.empty()) { + strides.resize(dims.size(), 1); + for (int i=strides.size()-1; i>0; --i) { + strides[i-1] = strides[i] * dims[i]; + } + } + + // Set cuDNN object + create(); + CHECK_CUDNN( + cudnnSetTensorNdDescriptor( + desc_, + data_type, + dims.size(), + dims.data(), + strides.data())); + +} + +// ----------------------------- +// FilterDescriptor +// ----------------------------- + +FilterDescriptor::FilterDescriptor(cudnnFilterDescriptor_t desc) + : desc_{desc} +{} + +FilterDescriptor::~FilterDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyFilterDescriptor(desc_); + } +} + +FilterDescriptor::FilterDescriptor(const FilterDescriptor& other) { + if (other.desc_) { + int num_dims; + cudnnDataType_t data_type; + cudnnTensorFormat_t format; + CHECK_CUDNN( + cudnnGetFilterNdDescriptor( + other.desc_, + 0, // nbDimsRequested + &data_type, + &format, + &num_dims, + nullptr)); // filterDimA + std::vector<int> dims(num_dims); + CHECK_CUDNN( + cudnnGetFilterNdDescriptor( + other.desc_, + num_dims, + &data_type, + &format, + &num_dims, + dims.data())); + set(data_type, format, dims); + } +} + +FilterDescriptor::FilterDescriptor(FilterDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +} + +FilterDescriptor& FilterDescriptor::operator=(FilterDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(FilterDescriptor& first, FilterDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void FilterDescriptor::reset(cudnnFilterDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyFilterDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnFilterDescriptor_t FilterDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnFilterDescriptor_t FilterDescriptor::get() const noexcept { + return desc_; +} + +FilterDescriptor::operator cudnnFilterDescriptor_t() const noexcept { + return get(); +} + +void FilterDescriptor::create() { + if (!desc_) { 
CHECK_CUDNN(cudnnCreateFilterDescriptor(&desc_)); + } +} + +void FilterDescriptor::set( + cudnnDataType_t data_type, + cudnnTensorFormat_t format, + const std::vector& dims) { + create(); + CHECK_CUDNN( + cudnnSetFilterNdDescriptor( + desc_, + data_type, + format, + dims.size(), + dims.data())); +} + +// ----------------------------- +// DropoutDescriptor +// ----------------------------- + +DropoutDescriptor::DropoutDescriptor(cudnnDropoutDescriptor_t desc) + : desc_{desc} +{} + +DropoutDescriptor::~DropoutDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyDropoutDescriptor(desc_); + } +} + +DropoutDescriptor::DropoutDescriptor(const DropoutDescriptor& other) { + if (other.desc_) { + float dropout; + void* states; + size_t states_size; + unsigned long long seed; + CHECK_CUDNN(cudnnDropoutGetStatesSize(get_handle(), &states_size)); + CHECK_CUDNN( + cudnnGetDropoutDescriptor( + other.desc_, + get_handle(), + &dropout, + &states, + &seed)); + set(dropout, states, states_size, seed); + } +} + +DropoutDescriptor::DropoutDescriptor(DropoutDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +} + +DropoutDescriptor& DropoutDescriptor::operator=(DropoutDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(DropoutDescriptor& first, DropoutDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void DropoutDescriptor::reset(cudnnDropoutDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyDropoutDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnDropoutDescriptor_t DropoutDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnDropoutDescriptor_t DropoutDescriptor::get() const noexcept { + return desc_; +} + +DropoutDescriptor::operator cudnnDropoutDescriptor_t() const noexcept { + return get(); +} + +void DropoutDescriptor::create() { + if (!desc_) { + CHECK_CUDNN(cudnnCreateDropoutDescriptor(&desc_)); + } +} + +void DropoutDescriptor::set( + float dropout, + void* states, + size_t states_size, + unsigned long long seed) { + create(); + CHECK_CUDNN( + cudnnSetDropoutDescriptor( + desc_, + get_handle(), + dropout, + states, + states_size, + seed)); +} + +// ----------------------------- +// RNNDescriptor +// ----------------------------- + +RNNDescriptor::RNNDescriptor(cudnnRNNDescriptor_t desc) + : desc_{desc} +{} + +RNNDescriptor::~RNNDescriptor() { + if (desc_) { + // Don't check status to avoid exceptions + cudnnDestroyRNNDescriptor(desc_); + } +} + +RNNDescriptor::RNNDescriptor(const RNNDescriptor& other) { + if (other.desc_) { + int hidden_size, num_layers; + cudnnDropoutDescriptor_t dropout_desc; + cudnnRNNInputMode_t input_mode; + cudnnDirectionMode_t direction; + cudnnRNNMode_t mode; + cudnnRNNAlgo_t algo; + cudnnDataType_t math_precision; +#if CUDNN_VERSION >= 8000 + CHECK_CUDNN( + cudnnGetRNNDescriptor_v6( + get_handle(), + other.desc_, + &hidden_size, + &num_layers, + &dropout_desc, + &input_mode, + &direction, + &mode, + &algo, + &math_precision)); +#else // CUDNN_VERSION < 8000 + CHECK_CUDNN( + cudnnGetRNNDescriptor( + get_handle(), + other.desc_, + &hidden_size, + &num_layers, + &dropout_desc, + &input_mode, + &direction, + &mode, + &algo, + &math_precision)); +#endif // CUDNN_VERSION >= 8000 + set( + hidden_size, + num_layers, + dropout_desc, + input_mode, + direction, + mode, + algo, + math_precision); + } +} + +RNNDescriptor::RNNDescriptor(RNNDescriptor&& other) + : desc_{other.desc_} { + other.desc_ = nullptr; +} + +RNNDescriptor& 
RNNDescriptor::operator=(RNNDescriptor other) { + swap(other, *this); + return *this; +} + +void swap(RNNDescriptor& first, RNNDescriptor& second) { + std::swap(first.desc_, second.desc_); +} + +void RNNDescriptor::reset(cudnnRNNDescriptor_t desc) { + if (desc_) { + CHECK_CUDNN(cudnnDestroyRNNDescriptor(desc_)); + } + desc_ = desc; +} + +cudnnRNNDescriptor_t RNNDescriptor::release() { + auto old_desc = desc_; + desc_ = nullptr; + return old_desc; +} + +cudnnRNNDescriptor_t RNNDescriptor::get() const noexcept { + return desc_; +} + +RNNDescriptor::operator cudnnRNNDescriptor_t() const noexcept { + return get(); +} + +void RNNDescriptor::create() { + if (!desc_) { + CHECK_CUDNN(cudnnCreateRNNDescriptor(&desc_)); + } +} + +void RNNDescriptor::set( + size_t hidden_size, + size_t num_layers, + cudnnDropoutDescriptor_t dropout_desc, + cudnnRNNInputMode_t input_mode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnRNNAlgo_t algo, + cudnnDataType_t math_precision) { + create(); + CHECK_CUDNN( + cudnnSetRNNDescriptor_v6( + get_handle(), + desc_, + hidden_size, + num_layers, + dropout_desc, + input_mode, + direction, + mode, + algo, + math_precision)); +} + //////////////////////////////////////////////////////////// // Base cuDNN tensor manager //////////////////////////////////////////////////////////// diff --git a/src/utils/distconv.cpp b/src/utils/distconv.cpp index f20923726fb..34fcf3a6e8e 100644 --- a/src/utils/distconv.cpp +++ b/src/utils/distconv.cpp @@ -289,12 +289,12 @@ void initialize(MPI_Comm comm) { p2p_instance = new p2p::P2P(mpi_comm); #endif // DISTCONV_HAS_P2P mpicuda_comm_instance = new Al::mpicuda_backend::comm_type( - mpi_comm, El::GPUManager::Stream()); + mpi_comm, hydrogen::cuda::GetDefaultStream()); ::distconv::cudnn::Options backend_opts; backend_opts.m_deterministic = opt_deterministic; backend_instance = new Backend( mpi_comm, lbann::cudnn::get_handle(), - El::GPUManager::Stream(), backend_opts); + hydrogen::cuda::GetDefaultStream(), backend_opts); print_options(std::cout); initialized = true; } diff --git a/src/utils/file_utils.cpp b/src/utils/file_utils.cpp index b2c806f9ac5..4bb9b1bbb31 100644 --- a/src/utils/file_utils.cpp +++ b/src/utils/file_utils.cpp @@ -172,7 +172,7 @@ bool create_dir(const std::string dirname) { } /// Load a file into a buffer -bool load_file(const std::string filename, std::vector& buf) { +bool load_file(const std::string filename, std::vector& buf, bool append) { std::ifstream file(filename, std::ios::binary); if (!file.good()) { return false; @@ -181,13 +181,17 @@ bool load_file(const std::string filename, std::vector& buf) { file.unsetf(std::ios::skipws); file.seekg(0, std::ios::end); - const std::streampos file_size = file.tellg(); + const std::streamsize file_size = static_cast(file.tellg()); file.seekg(0, std::ios::beg); - buf.resize(file_size); + if (!append) { + buf.clear(); + } + const size_t cur_size = buf.size(); + buf.resize(static_cast(file_size) + cur_size); - file.read(buf.data(), file_size); + file.read(buf.data() + cur_size, file_size); return true; } diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index 53a0895c312..62937b58638 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -194,8 +194,8 @@ std::unique_ptr construct_trainer(lbann_comm *comm, std::vector data_seq_random_seeds(comm->get_procs_in_world()); comm->world_all_gather(data_seq_random_seed, data_seq_random_seeds); - // Update the index lists to accomodate multi-trainer / multi-model specification - 
customize_data_readers_index_list(*comm, pb); + // Update the sample lists to accommodate multi-trainer / multi-model specification + customize_data_readers_sample_list(*comm, pb); // Initialize data readers //@todo: code not in place for correctly handling image preprocessing @@ -392,7 +392,7 @@ void print_lbann_configuration(lbann_comm *comm, int io_threads_per_process, int << " I/O threads per process (+offset) : " << io_threads_per_process << " (+" << io_threads_offset << ")" << std::endl; #ifdef HYDROGEN_HAVE_CUDA - std::cout << " GPUs on node : " << El::GPUManager::NumDevices() << std::endl; + std::cout << " GPUs on node : " << hydrogen::gpu::DeviceCount() << std::endl; #endif // HYDROGEN_HAVE_CUDA std::cout << std::endl; diff --git a/src/utils/profiling.cpp b/src/utils/profiling.cpp index cc237411778..e0f26b2c137 100644 --- a/src/utils/profiling.cpp +++ b/src/utils/profiling.cpp @@ -73,7 +73,7 @@ void prof_stop() { void prof_region_begin(const char *s, int c, bool sync) { if (!profiling_started) return; if (sync) { - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); } // Doesn't work with gcc 4.9 // nvtxEventAttributes_t ev = {0}; @@ -90,7 +90,7 @@ void prof_region_begin(const char *s, int c, bool sync) { void prof_region_end(const char *, bool sync) { if (!profiling_started) return; if (sync) { - El::GPUManager::SynchronizeDevice(); + hydrogen::gpu::SynchronizeDevice(); } nvtxRangePop(); } diff --git a/src/utils/threads/thread_topology.cpp b/src/utils/threads/thread_topology.cpp index a2c52e1228d..362d6151812 100644 --- a/src/utils/threads/thread_topology.cpp +++ b/src/utils/threads/thread_topology.cpp @@ -156,7 +156,7 @@ hwloc_cpuset_t get_local_cpuset_for_current_thread(hwloc_topology_t topo) { hwloc_cpuset_t local_cpuset = hwloc_bitmap_alloc(); #ifdef LBANN_HAS_GPU // Find CPUs close to the GPU being used - hwloc_cudart_get_device_cpuset(topo, hydrogen::GPUManager::Device(), local_cpuset); + hwloc_cudart_get_device_cpuset(topo, hydrogen::gpu::DefaultDevice(), local_cpuset); #else hwloc_const_cpuset_t allowed_cpuset = hwloc_topology_get_allowed_cpuset(topo); local_cpuset = hwloc_bitmap_dup(allowed_cpuset); diff --git a/src/utils/unit_test/hash_test.cpp b/src/utils/unit_test/hash_test.cpp index 7de4b81802a..4683853523b 100644 --- a/src/utils/unit_test/hash_test.cpp +++ b/src/utils/unit_test/hash_test.cpp @@ -10,7 +10,7 @@ TEST_CASE ("Testing convenience functions for hashing", "[hash][utilities]") { SECTION ("hash_combine") { std::unordered_set<size_t> hashes; - for (size_t seed=0; seed<10; ++seed) { + for (size_t seed=0; seed<=16; seed+=2) { hashes.insert(seed); } for (size_t seed=0; seed<=16; seed+=2) { @@ -27,21 +27,29 @@ TEST_CASE ("Testing convenience functions for hashing", "[hash][utilities]") { std::vector<Humor> enum_list = { Humor::MELANCHOLIC, Humor::SANGUINE, Humor::CHOLERIC, Humor::PHLEGMATIC }; std::unordered_set<size_t> hashes; - for (size_t i=0; i<enum_list.size(); ++i) { - const auto hash = lbann::enum_hash<Humor>()(enum_list[i]); + for (const auto val : enum_list) { + const auto hash = lbann::enum_hash<Humor>()(val); CHECK_FALSE(hashes.count(hash)); hashes.insert(hash); } } SECTION ("pair_hash") { + const std::vector<unsigned long> i_list = {1, 2, 1018, 1019, + 11209, 543210, 4294967295}; + const std::vector<float> j_list = {-12.34f, -8.76f, -4.56f, + 0.f, 4.56f, 8.76f, 12.34f}; std::unordered_set<size_t> hashes; - for (char i=-12; i<=12; i+=3) { - for (unsigned long j=0; j<=11209; j+=1019) { - std::pair<char, unsigned long> val(i,j); - const auto hash = lbann::pair_hash<char, unsigned long>()(val); - CHECK_FALSE(hashes.count(hash)); - hashes.insert(hash); + for (const auto i : i_list) { + for (const auto j : j_list) { + std::pair<unsigned long, float> val1(i,j); + const auto hash1 = lbann::pair_hash<unsigned long, float>()(val1); + CHECK_FALSE(hashes.count(hash1)); + hashes.insert(hash1); + std::pair<float, unsigned long> val2(j,i); + const auto hash2 = lbann::pair_hash<float, unsigned long>()(val2); + CHECK_FALSE(hashes.count(hash2)); + hashes.insert(hash2); } } }
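The reworked pair_hash test now inserts each pair hashed in both orders, which would catch a symmetric implementation (for example, XORing the two element hashes) that collides on (a,b) and (b,a). LBANN's hash utilities are assumed here to follow the usual boost-style combiner; a sketch:

  #include <cstddef>
  #include <functional>
  #include <utility>

  // Sketch of an order-sensitive pair hash built on a boost-style combiner.
  inline std::size_t combine(std::size_t seed, std::size_t v) {
    return seed ^ (v + 0x9e3779b9 + (seed << 6) + (seed >> 2));
  }

  template <typename T1, typename T2>
  struct pair_hash_sketch {
    std::size_t operator()(const std::pair<T1, T2>& p) const {
      const std::size_t h1 = std::hash<T1>{}(p.first);
      return combine(h1, std::hash<T2>{}(p.second));  // order matters
    }
  };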
diff --git a/src/weights/initializer.cpp b/src/weights/initializer.cpp index 8e3bf294dd3..a941909e01a 100644 --- a/src/weights/initializer.cpp +++ b/src/weights/initializer.cpp @@ -95,7 +95,7 @@ void value_initializer::fill(AbsDistMatrixType& matrix) { if (matrix.GetLocalDevice() != El::Device::CPU) { El::Copy(matrix_cpu, matrix.Matrix()); #ifdef HYDROGEN_HAVE_CUDA - El::GPUManager::SynchronizeStream(); /// @todo Use new Hydrogen synchronization semantics when available + Synchronize(hydrogen::gpu::DefaultSyncInfo()); /// @todo Use new Hydrogen synchronization semantics when available #endif // HYDROGEN_HAVE_CUDA } diff --git a/superbuild/aluminum/CMakeLists.txt b/superbuild/aluminum/CMakeLists.txt index 75ac7faf73a..b97e8043057 100644 --- a/superbuild/aluminum/CMakeLists.txt +++ b/superbuild/aluminum/CMakeLists.txt @@ -11,7 +11,7 @@ else () CACHE STRING "The URL from which to clone Aluminum") endif () -set(ALUMINUM_TAG "v0.3.3" +set(ALUMINUM_TAG "v0.4.0" CACHE STRING "The git tag to checkout for Aluminum") set(ALUMINUM_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index bda354252b9..15107a2890a 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -109,7 +109,7 @@ else () endif () # ... then the tag. -set(HYDROGEN_TAG "v1.3.4" +set(HYDROGEN_TAG "v1.4.0" CACHE STRING "The git tag or hash to checkout for Hydrogen") if (HYDROGEN_CUSTOM_SOURCE_DIR)