diff --git a/experiments/run_lbann_dnn_multi_mnist.sh b/experiments/run_lbann_dnn_multi_mnist.sh
index 4f511673dbf..6a2bf780766 100755
--- a/experiments/run_lbann_dnn_multi_mnist.sh
+++ b/experiments/run_lbann_dnn_multi_mnist.sh
@@ -25,6 +25,9 @@ ACT=1
 LRM=1
 TEST_W_TRAIN_DATA=0
 LR_DECAY=0.5
+PROCS_PER_MODEL=
+SUMMARY_DIR=
+IMCOMM=

 RUN="srun"

@@ -65,6 +68,7 @@ function HELP {
   echo "${REV}-f${NORM} --Path to the ${BOLD}datasets${NORM}. Default is ${BOLD}${ROOT_DATASET_DIR}${NORM}."
   echo "${REV}-i${NORM} --Sets the ${BOLD}parallel I/O limit${NORM}. Default is ${BOLD}${PARIO}${NORM}."
   echo "${REV}-j${NORM} --Sets the ${BOLD}learning rate decay${NORM}. Default is ${BOLD}${LR_DECAY}${NORM}."
+  echo "${REV}-k${NORM} --Sets the ${BOLD}number of processes per model${NORM}. 0 for one model."
   echo "${REV}-l${NORM} --Determines if the model is ${BOLD}loaded${NORM}. Default is ${BOLD}${LOAD_MODEL}${NORM}."
   echo "${REV}-m${NORM} --Sets the ${BOLD}mode${NORM}. Default is ${BOLD}${MODE}${NORM}."
   echo "${REV}-n${NORM} --Sets the ${BOLD}network topology${NORM}. Default is ${BOLD}${NETWORK}${NORM}."
@@ -76,12 +80,14 @@ function HELP {
   echo "${REV}-t${NORM} --Sets the number of ${BOLD}training samples${NORM}. Default is ${BOLD}${TRAINING_SAMPLES}${NORM}."
   echo "${REV}-u${NORM} --Use the ${BOLD}Lustre filesystem${NORM} directly. Default is ${BOLD}${USE_LUSTRE_DIRECT}${NORM}."
   echo "${REV}-v${NORM} --Sets the number of ${BOLD}validation samples${NORM}. Default is ${BOLD}${VALIDATION_SAMPLES}${NORM}."
+  echo "${REV}-w${NORM} --Sets the ${BOLD}summary output directory${NORM}."
+  echo "${REV}-x${NORM} --Sets the type of ${BOLD}intermodel communication${NORM}."
   echo "${REV}-z${NORM} --Sets the ${BOLD}tasks per node${NORM}. Default is ${BOLD}${TASKS_PER_NODE}${NORM}."
   echo -e "${REV}-h${NORM} --Displays this help message. No further functions are performed."\\n
   exit 1
 }

-while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
+while getopts ":a:b:cde:f:hi:j:k:l:m:n:o:p:q:r:s:t:uv:w:x:z:" opt; do
   case $opt in
     a)
       ACT=$OPTARG
@@ -111,6 +117,9 @@ while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
     j)
       LR_DECAY=$OPTARG
       ;;
+    k)
+      PROCS_PER_MODEL="--procs-per-model $OPTARG"
+      ;;
     l)
       LOAD_MODEL=$OPTARG
       ;;
@@ -144,6 +153,12 @@ while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
     v)
      VALIDATION_SAMPLES=$OPTARG
      ;;
+    w)
+      SUMMARY_DIR="--summary-dir $OPTARG"
+      ;;
+    x)
+      IMCOMM="--imcomm $OPTARG"
+      ;;
     z)
       TASKS_PER_NODE=$OPTARG
       ;;
@@ -233,7 +248,7 @@ fi
 fi

-CMD="${RUN} -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} ${BINDIR}/lbann_dnn_multi_mnist --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY} --lambda 0.1"
+CMD="${RUN} -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} ${BINDIR}/lbann_dnn_multi_mnist --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY} --lambda 0.1 ${SUMMARY_DIR} ${IMCOMM} ${PROCS_PER_MODEL}"
 #CMD="${RUN} -N1 -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} --distribution=block --drop-caches=pagecache ${DIRNAME}/lbann_dnn_mnist --par-IO ${PARIO} --dataset ${ROOT_DATASET_DIR}/${DATASET_DIR}/ --max-validation-samples ${VALIDATION_SAMPLES} --profiling true --max-training-samples ${TRAINING_SAMPLES} --block-size ${BLOCK_SIZE} --output ${OUTPUT_DIR} --mode ${MODE} --num-epochs ${EPOCHS} --params ${PARAM_DIR} --save-model ${SAVE_MODEL} --load-model ${LOAD_MODEL} --mb-size ${MB_SIZE} --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY}"
 echo ${CMD}
 ${CMD}
diff --git a/include/lbann/lbann_params.hpp b/include/lbann/lbann_params.hpp
index abaf5922cfd..3491c5f0046 100644
--- a/include/lbann/lbann_params.hpp
+++ b/include/lbann/lbann_params.hpp
@@ -102,7 +102,13 @@ namespace lbann
     std::string TrainFile;
     /// Test data file name
     std::string TestFile;
+    /// Location to write summary files.
+    std::string SummaryDir;
+    /// Type of intermodel communication to use, if any.
+    int IntermodelCommMethod;
+    /// Number of processes to use in each model (if using multiple).
+    int ProcsPerModel;
   };

   /// Performance parameters
diff --git a/model_zoo/lbann_dnn_imagenet.cpp b/model_zoo/lbann_dnn_imagenet.cpp
index ea6dc242652..f57b15acfe4 100644
--- a/model_zoo/lbann_dnn_imagenet.cpp
+++ b/model_zoo/lbann_dnn_imagenet.cpp
@@ -619,6 +619,7 @@ int main(int argc, char* argv[])
   TrainingParams trainParams;
   trainParams.DatasetRootDir = "/p/lscratchf/brainusr/datasets/ILSVRC2012/";
   trainParams.DropOut = 0.1;
+  trainParams.ProcsPerModel = 0;
   trainParams.parse_params();
   PerformanceParams perfParams;
   perfParams.parse_params();
@@ -657,7 +658,7 @@ int main(int argc, char* argv[])
   double sec_all_val = 0;

   // Set up the communicator and get the grid.
-  comm = new lbann_comm();
+  comm = new lbann_comm(trainParams.ProcsPerModel);
   Grid& grid = comm->get_model_grid();
   if (comm->am_world_master()) {
     cout << "Number of models: " << comm->get_num_models() << endl;
diff --git a/model_zoo/lbann_dnn_mnist.cpp b/model_zoo/lbann_dnn_mnist.cpp
index bd3f0889570..1bd6da5827e 100644
--- a/model_zoo/lbann_dnn_mnist.cpp
+++ b/model_zoo/lbann_dnn_mnist.cpp
@@ -71,6 +71,7 @@ int main(int argc, char* argv[])
   trainParams.MBSize = 10;
   trainParams.LearnRate = 0.0001;
   trainParams.DropOut = -1.0f;
+  trainParams.ProcsPerModel = 0;

   PerformanceParams perfParams;
   perfParams.BlockSize = 256;
@@ -84,7 +85,7 @@ int main(int argc, char* argv[])
   SetBlocksize(perfParams.BlockSize);

   // Set up the communicator and get the grid.
-  lbann_comm* comm = new lbann_comm();
+  lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
   Grid& grid = comm->get_model_grid();
   if (comm->am_world_master()) {
     cout << "Number of models: " << comm->get_num_models() << endl;
diff --git a/model_zoo/lbann_dnn_multi_mnist.cpp b/model_zoo/lbann_dnn_multi_mnist.cpp
index 34caea0938b..0e3969765ad 100644
--- a/model_zoo/lbann_dnn_multi_mnist.cpp
+++ b/model_zoo/lbann_dnn_multi_mnist.cpp
@@ -65,6 +65,9 @@ int main(int argc, char* argv[])
   trainParams.MBSize = 10;
   trainParams.LearnRate = 0.0001;
   trainParams.DropOut = -1.0f;
+  trainParams.ProcsPerModel = 12;  // Use one Catalyst node.
+  trainParams.IntermodelCommMethod = static_cast<int>(
+    lbann_callback_imcomm::COMPRESSED_ADAPTIVE_THRESH_QUANTIZATION);

   PerformanceParams perfParams;
   perfParams.BlockSize = 256;
@@ -79,10 +82,11 @@ int main(int argc, char* argv[])
   SetBlocksize(perfParams.BlockSize);

   // Set up the communicator and get the grid.
-  lbann_comm* comm = new lbann_comm(12);
+  lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
   Grid& grid = comm->get_model_grid();
   if (comm->am_world_master()) {
-    cout << "Number of models: " << comm->get_num_models() << endl;
+    cout << "Number of models: " << comm->get_num_models() <<
+      " (" << comm->get_procs_per_model() << " procs per model)" << endl;
     cout << "Grid is " << grid.Height() << " x " << grid.Width() << endl;
     cout << endl;
   }
@@ -151,13 +155,13 @@ int main(int argc, char* argv[])
   uint fcidx2 = dnn.add(
     "FullyConnected", 30, trainParams.ActivationType,
     {new dropout(trainParams.DropOut)});
-  dnn.add("SoftMax", 10);
+  uint smidx = dnn.add("SoftMax", 10);
   target_layer *target_layer = new target_layer_distributed_minibatch(
     comm, (int) trainParams.MBSize, &mnist_trainset, &mnist_testset, true);
   //target_layer *target_layer = new target_layer_distributed_minibatch_parallel_io(comm, parallel_io, (int) trainParams.MBSize, &mnist_trainset, &mnist_testset, true);
   dnn.add(target_layer);

-  lbann_summary summarizer("/p/lscratchf/dryden1", comm);
+  lbann_summary summarizer(trainParams.SummaryDir, comm);
   // Print out information for each epoch.
   lbann_callback_print print_cb;
   dnn.add_callback(&print_cb);
@@ -169,8 +173,9 @@ int main(int argc, char* argv[])
   dnn.add_callback(&summary_cb);
   // Do global inter-model updates.
   lbann_callback_imcomm imcomm_cb(
-    lbann_callback_imcomm::COMPRESSED_ADAPTIVE_THRESH_QUANTIZATION,
-    {fcidx1, fcidx2}, &summarizer);
+    static_cast<lbann_callback_imcomm::comm_type>(
+      trainParams.IntermodelCommMethod),
+    {fcidx1, fcidx2, smidx}, &summarizer);
   dnn.add_callback(&imcomm_cb);

   if (comm->am_world_master()) {
diff --git a/model_zoo/lbann_dnn_nci.cpp b/model_zoo/lbann_dnn_nci.cpp
index a7529fc6bae..947adef973c 100644
--- a/model_zoo/lbann_dnn_nci.cpp
+++ b/model_zoo/lbann_dnn_nci.cpp
@@ -54,6 +54,7 @@ int main(int argc, char* argv[])
   trainParams.MBSize = 50;
   trainParams.LearnRate = 0.0001;
   trainParams.DropOut = -1.0f;
+  trainParams.ProcsPerModel = 0;

   PerformanceParams perfParams;
   perfParams.BlockSize = 256;
@@ -74,7 +75,7 @@ int main(int argc, char* argv[])
   const string test_data = trainParams.DatasetRootDir + trainParams.TestFile;

   // Set up the communicator and get the grid.
-  lbann_comm* comm = new lbann_comm();
+  lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
   Grid& grid = comm->get_model_grid();
   if (comm->am_world_master()) {
     cout << "Number of models: " << comm->get_num_models() << endl;
diff --git a/src/lbann_comm.cpp b/src/lbann_comm.cpp
index 390f7b4fdd4..b12ef2ea509 100644
--- a/src/lbann_comm.cpp
+++ b/src/lbann_comm.cpp
@@ -27,7 +27,7 @@
 ////////////////////////////////////////////////////////////////////////////////

 #include "lbann/lbann_comm.hpp"
-
+#include "lbann/utils/lbann_exception.hpp"
 #include "mpi.h"

 using namespace std;
@@ -39,10 +39,17 @@ lbann::lbann_comm::lbann_comm(int _procs_per_model) :
   procs_per_model(_procs_per_model), num_model_barriers(0),
   num_intermodel_barriers(0), num_global_barriers(0), bytes_sent(0),
   bytes_received(0) {
+  int world_size = mpi::Size(mpi::COMM_WORLD);
   if (procs_per_model == 0) {
-    procs_per_model = mpi::Size(mpi::COMM_WORLD);
+    procs_per_model = world_size;
+  }
+  if (procs_per_model > world_size) {
+    throw lbann_exception("lbann_comm: Not enough processes to create one model");
+  }
+  if (world_size % procs_per_model != 0) {
+    throw lbann_exception("lbann_comm: Procs per model does not divide total number of procs");
   }
-  num_models = mpi::Size(mpi::COMM_WORLD) / procs_per_model;
+  num_models = world_size / procs_per_model;
   model_rank = mpi::Rank(mpi::COMM_WORLD) / procs_per_model;
   rank_in_model = mpi::Rank(mpi::COMM_WORLD) % procs_per_model;
   mpi::Split(mpi::COMM_WORLD, model_rank, rank_in_model, model_comm);
diff --git a/src/lbann_params.cpp b/src/lbann_params.cpp
index f0fa9f9b5c1..c7c30c23125 100644
--- a/src/lbann_params.cpp
+++ b/src/lbann_params.cpp
@@ -38,7 +38,8 @@ lbann::TrainingParams::TrainingParams(void)
   LrDecayRate(0.5), LrDecayCycles(5000), ActivationType(activation_type::SIGMOID),
   DropOut(-1), Lambda(0), DatasetRootDir("."), SaveImageDir("."), ParameterDir("."),
-  SaveModel(false), LoadModel(false), Checkpoint(10), TrainFile(" "), TestFile(" "){
+  SaveModel(false), LoadModel(false), Checkpoint(10), TrainFile(" "),
+  TestFile(" "), SummaryDir("."), IntermodelCommMethod(0), ProcsPerModel(0) {
   MaxMBCount = MaxTrainingSamples > 0 ?
     ceil((double)MaxTrainingSamples / (double)MBSize) : 0;
 }

@@ -76,7 +77,13 @@ void lbann::TrainingParams::parse_params(void) {
   SaveModel = Input("--save-model", "Save the current model", SaveModel);
   LoadModel = Input("--load-model", "Load a saved model", LoadModel);
   Checkpoint = Input("--checkpoint", "Number of training epochs between checkpoints", Checkpoint);
+  SummaryDir = Input("--summary-dir", "Directory to write summary files", SummaryDir);
+  IntermodelCommMethod = Input("--imcomm", "Type of inter-model communication",
+                               IntermodelCommMethod);
+  ProcsPerModel = Input("--procs-per-model",
+                        "Number of processes per model (0 = one model)",
+                        ProcsPerModel);
 }

 lbann::PerformanceParams::PerformanceParams(void)
   : BlockSize(256), MaxParIOSize(0) {}
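
For reference, the heart of this patch is the lbann_comm constructor change: MPI_COMM_WORLD is partitioned into contiguous blocks of procs_per_model ranks, one block per model, and sizes that cannot form at least one model or that do not divide the world evenly are rejected. The standalone sketch below reproduces that arithmetic with plain MPI calls rather than LBANN's Elemental mpi:: wrappers; the hard-coded procs_per_model value is illustrative only (LBANN reads it from --procs-per-model).

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int world_size, world_rank;
      MPI_Comm_size(MPI_COMM_WORLD, &world_size);
      MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

      int procs_per_model = 2;  // illustrative; 0 means "one model spanning all ranks"
      if (procs_per_model == 0) procs_per_model = world_size;
      if (procs_per_model > world_size || world_size % procs_per_model != 0) {
        MPI_Abort(MPI_COMM_WORLD, 1);  // the same conditions the new constructor rejects
      }

      // Ranks are assigned to models in contiguous blocks, as in lbann_comm.
      int num_models = world_size / procs_per_model;
      int model_rank = world_rank / procs_per_model;     // which model owns this rank
      int rank_in_model = world_rank % procs_per_model;  // rank within that model

      MPI_Comm model_comm;
      MPI_Comm_split(MPI_COMM_WORLD, model_rank, rank_in_model, &model_comm);

      if (world_rank == 0) {
        std::printf("Number of models: %d (%d procs per model)\n",
                    num_models, procs_per_model);
      }
      MPI_Comm_free(&model_comm);
      MPI_Finalize();
      return 0;
    }

On the script side, the new options feed the same machinery: for example, ./run_lbann_dnn_multi_mnist.sh -k 12 -w /path/to/summaries -x 1 passes --procs-per-model 12, --summary-dir /path/to/summaries, and --imcomm 1 through to the binary. The integer given to -x selects a lbann_callback_imcomm::comm_type value; the 1 here is a placeholder, not a documented mapping.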