Commit df9b3b1
Merge branch 'args' into develop
ndryden committed Jul 18, 2016
2 parents 151b0e2 + 3d3d928 commit df9b3b1
Showing 8 changed files with 58 additions and 15 deletions.
19 changes: 17 additions & 2 deletions experiments/run_lbann_dnn_multi_mnist.sh
@@ -25,6 +25,9 @@ ACT=1
LRM=1
TEST_W_TRAIN_DATA=0
LR_DECAY=0.5
+PROCS_PER_MODEL=
+SUMMARY_DIR=
+IMCOMM=

RUN="srun"

@@ -65,6 +68,7 @@ function HELP {
echo "${REV}-f${NORM} <val> --Path to the ${BOLD}datasets${NORM}. Default is ${BOLD}${ROOT_DATASET_DIR}${NORM}."
echo "${REV}-i${NORM} <val> --Sets the ${BOLD}parallel I/O limit${NORM}. Default is ${BOLD}${PARIO}${NORM}."
echo "${REV}-j${NORM} <val> --Sets the ${BOLD}learning rate decay${NORM}. Default is ${BOLD}${LR_DECAY}${NORM}."
+echo "${REV}-k${NORM} <val> --Sets the ${BOLD}number of processes per model${NORM}. 0 for one model."
echo "${REV}-l${NORM} <val> --Determines if the model is ${BOLD}loaded${NORM}. Default is ${BOLD}${LOAD_MODEL}${NORM}."
echo "${REV}-m${NORM} <val> --Sets the ${BOLD}mode${NORM}. Default is ${BOLD}${MODE}${NORM}."
echo "${REV}-n${NORM} <val> --Sets the ${BOLD}network topology${NORM}. Default is ${BOLD}${NETWORK}${NORM}."
@@ -76,12 +80,14 @@ function HELP {
echo "${REV}-t${NORM} <val> --Sets the number of ${BOLD}training samples${NORM}. Default is ${BOLD}${TRAINING_SAMPLES}${NORM}."
echo "${REV}-u${NORM} --Use the ${BOLD}Lustre filesystem${NORM} directly. Default is ${BOLD}${USE_LUSTRE_DIRECT}${NORM}."
echo "${REV}-v${NORM} <val> --Sets the number of ${BOLD}validation samples${NORM}. Default is ${BOLD}${VALIDATION_SAMPLES}${NORM}."
+echo "${REV}-w${NORM} <val> --Sets the ${BOLD}summary output directory${NORM}."
+echo "${REV}-x${NORM} <val> --Sets the type of ${BOLD}intermodel communication${NORM}."
echo "${REV}-z${NORM} <val> --Sets the ${BOLD}tasks per node${NORM}. Default is ${BOLD}${TASKS_PER_NODE}${NORM}."
echo -e "${REV}-h${NORM} --Displays this help message. No further functions are performed."\\n
exit 1
}

-while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
+while getopts ":a:b:cde:f:hi:j:k:l:m:n:o:p:q:r:s:t:uv:w:x:z:" opt; do
case $opt in
a)
ACT=$OPTARG
@@ -111,6 +117,9 @@ while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
j)
LR_DECAY=$OPTARG
;;
+k)
+PROCS_PER_MODEL="--procs-per-model $OPTARG"
+;;
l)
LOAD_MODEL=$OPTARG
;;
@@ -144,6 +153,12 @@ while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
v)
VALIDATION_SAMPLES=$OPTARG
;;
+w)
+SUMMARY_DIR="--summary-dir $OPTARG"
+;;
+x)
+IMCOMM="--imcomm $OPTARG"
+;;
z)
TASKS_PER_NODE=$OPTARG
;;
@@ -233,7 +248,7 @@ fi

fi

-CMD="${RUN} -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} ${BINDIR}/lbann_dnn_multi_mnist --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY} --lambda 0.1"
+CMD="${RUN} -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} ${BINDIR}/lbann_dnn_multi_mnist --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY} --lambda 0.1 ${SUMMARY_DIR} ${IMCOMM} ${PROCS_PER_MODEL}"
#CMD="${RUN} -N1 -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} --distribution=block --drop-caches=pagecache ${DIRNAME}/lbann_dnn_mnist --par-IO ${PARIO} --dataset ${ROOT_DATASET_DIR}/${DATASET_DIR}/ --max-validation-samples ${VALIDATION_SAMPLES} --profiling true --max-training-samples ${TRAINING_SAMPLES} --block-size ${BLOCK_SIZE} --output ${OUTPUT_DIR} --mode ${MODE} --num-epochs ${EPOCHS} --params ${PARAM_DIR} --save-model ${SAVE_MODEL} --load-model ${LOAD_MODEL} --mb-size ${MB_SIZE} --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY}"
echo ${CMD}
${CMD}
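
Taken together, the new -k, -w, and -x flags let one launch of this script configure a multi-model run. A purely illustrative invocation (the flag values here are examples, not defaults from this commit):

./run_lbann_dnn_multi_mnist.sh -k 12 -w /path/to/summaries -x 1

This forwards --procs-per-model 12, --summary-dir /path/to/summaries, and --imcomm 1 to lbann_dnn_multi_mnist. Note that the integer passed to -x is cast back to lbann_callback_imcomm::comm_type inside the binary (see lbann_dnn_multi_mnist.cpp below), so which method a given value selects depends on that enum's definition.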
6 changes: 6 additions & 0 deletions include/lbann/lbann_params.hpp
@@ -102,7 +102,13 @@ namespace lbann
std::string TrainFile;
/// Test data file name
std::string TestFile;
+/// Location to write summary files.
+std::string SummaryDir;

+/// Type of intermodel communication to use, if any.
+int IntermodelCommMethod;
+/// Number of processes to use in each model (if using multiple).
+int ProcsPerModel;
};

/// Performance parameters
3 changes: 2 additions & 1 deletion model_zoo/lbann_dnn_imagenet.cpp
@@ -619,6 +619,7 @@ int main(int argc, char* argv[])
TrainingParams trainParams;
trainParams.DatasetRootDir = "/p/lscratchf/brainusr/datasets/ILSVRC2012/";
trainParams.DropOut = 0.1;
+trainParams.ProcsPerModel = 0;
trainParams.parse_params();
PerformanceParams perfParams;
perfParams.parse_params();
@@ -657,7 +658,7 @@ int main(int argc, char* argv[])
double sec_all_val = 0;

// Set up the communicator and get the grid.
-comm = new lbann_comm();
+comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
cout << "Number of models: " << comm->get_num_models() << endl;
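
Note that this driver (like lbann_dnn_mnist.cpp and lbann_dnn_nci.cpp below) initializes ProcsPerModel to 0 before parsing arguments, i.e. one model spanning all processes, so the existing single-model behavior is unchanged unless --procs-per-model is passed on the command line. Only the multi-MNIST driver defaults to an actual multi-model split.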
3 changes: 2 additions & 1 deletion model_zoo/lbann_dnn_mnist.cpp
@@ -71,6 +71,7 @@ int main(int argc, char* argv[])
trainParams.MBSize = 10;
trainParams.LearnRate = 0.0001;
trainParams.DropOut = -1.0f;
+trainParams.ProcsPerModel = 0;
PerformanceParams perfParams;
perfParams.BlockSize = 256;

@@ -84,7 +85,7 @@
SetBlocksize(perfParams.BlockSize);

// Set up the communicator and get the grid.
-lbann_comm* comm = new lbann_comm();
+lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
cout << "Number of models: " << comm->get_num_models() << endl;
17 changes: 11 additions & 6 deletions model_zoo/lbann_dnn_multi_mnist.cpp
@@ -65,6 +65,9 @@ int main(int argc, char* argv[])
trainParams.MBSize = 10;
trainParams.LearnRate = 0.0001;
trainParams.DropOut = -1.0f;
+trainParams.ProcsPerModel = 12; // Use one Catalyst node.
+trainParams.IntermodelCommMethod = static_cast<int>(
+lbann_callback_imcomm::COMPRESSED_ADAPTIVE_THRESH_QUANTIZATION);
PerformanceParams perfParams;
perfParams.BlockSize = 256;

@@ -79,10 +82,11 @@
SetBlocksize(perfParams.BlockSize);

// Set up the communicator and get the grid.
-lbann_comm* comm = new lbann_comm(12);
+lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
-cout << "Number of models: " << comm->get_num_models() << endl;
+cout << "Number of models: " << comm->get_num_models() <<
+" (" << comm->get_procs_per_model() << " procs per model)" << endl;
cout << "Grid is " << grid.Height() << " x " << grid.Width() << endl;
cout << endl;
}
@@ -151,13 +155,13 @@ int main(int argc, char* argv[])
uint fcidx2 = dnn.add(
"FullyConnected", 30, trainParams.ActivationType,
{new dropout(trainParams.DropOut)});
-dnn.add("SoftMax", 10);
+uint smidx = dnn.add("SoftMax", 10);
target_layer *target_layer = new target_layer_distributed_minibatch(
comm, (int) trainParams.MBSize, &mnist_trainset, &mnist_testset, true);
//target_layer *target_layer = new target_layer_distributed_minibatch_parallel_io(comm, parallel_io, (int) trainParams.MBSize, &mnist_trainset, &mnist_testset, true);
dnn.add(target_layer);

-lbann_summary summarizer("/p/lscratchf/dryden1", comm);
+lbann_summary summarizer(trainParams.SummaryDir, comm);
// Print out information for each epoch.
lbann_callback_print print_cb;
dnn.add_callback(&print_cb);
@@ -169,8 +173,9 @@
dnn.add_callback(&summary_cb);
// Do global inter-model updates.
lbann_callback_imcomm imcomm_cb(
-lbann_callback_imcomm::COMPRESSED_ADAPTIVE_THRESH_QUANTIZATION,
-{fcidx1, fcidx2}, &summarizer);
+static_cast<lbann_callback_imcomm::comm_type>(
+trainParams.IntermodelCommMethod),
+{fcidx1, fcidx2, smidx}, &summarizer);
dnn.add_callback(&imcomm_cb);

if (comm->am_world_master()) {
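
Two details in the multi-MNIST changes are easy to miss: the softmax layer's index (smidx) now joins fcidx1 and fcidx2 in the imcomm callback, so inter-model updates cover every learned layer, and the communication scheme is no longer hard-coded: the static_cast converts the integer --imcomm argument back into a lbann_callback_imcomm::comm_type, with compressed adaptive-threshold quantization still the default set above. The hard-coded lbann_comm(12) and /p/lscratchf summary path give way to the new parameters as well.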
3 changes: 2 additions & 1 deletion model_zoo/lbann_dnn_nci.cpp
@@ -54,6 +54,7 @@ int main(int argc, char* argv[])
trainParams.MBSize = 50;
trainParams.LearnRate = 0.0001;
trainParams.DropOut = -1.0f;
+trainParams.ProcsPerModel = 0;
PerformanceParams perfParams;
perfParams.BlockSize = 256;

@@ -74,7 +75,7 @@
const string test_data = trainParams.DatasetRootDir + trainParams.TestFile;

// Set up the communicator and get the grid.
-lbann_comm* comm = new lbann_comm();
+lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
cout << "Number of models: " << comm->get_num_models() << endl;
13 changes: 10 additions & 3 deletions src/lbann_comm.cpp
@@ -27,7 +27,7 @@
////////////////////////////////////////////////////////////////////////////////

#include "lbann/lbann_comm.hpp"

+#include "lbann/utils/lbann_exception.hpp"
#include "mpi.h"

using namespace std;
@@ -39,10 +39,17 @@ lbann::lbann_comm::lbann_comm(int _procs_per_model) :
procs_per_model(_procs_per_model), num_model_barriers(0),
num_intermodel_barriers(0), num_global_barriers(0), bytes_sent(0),
bytes_received(0) {
+int world_size = mpi::Size(mpi::COMM_WORLD);
if (procs_per_model == 0) {
-procs_per_model = mpi::Size(mpi::COMM_WORLD);
+procs_per_model = world_size;
}
+if (procs_per_model > world_size) {
+throw lbann_exception("lbann_comm: Not enough processes to create one model");
+}
+if (world_size % procs_per_model != 0) {
+throw lbann_exception("lbann_comm: Procs per model does not divide total number of procs");
+}
-num_models = mpi::Size(mpi::COMM_WORLD) / procs_per_model;
+num_models = world_size / procs_per_model;
model_rank = mpi::Rank(mpi::COMM_WORLD) / procs_per_model;
rank_in_model = mpi::Rank(mpi::COMM_WORLD) % procs_per_model;
mpi::Split(mpi::COMM_WORLD, model_rank, rank_in_model, model_comm);
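
The splitting arithmetic is easiest to see with concrete numbers. Here is a minimal, dependency-free C++ sketch (not LBANN code; a fixed world_size and a plain loop stand in for MPI) of how the constructor maps world ranks to models once the two new sanity checks pass:

#include <cstdio>

int main() {
  const int world_size = 24;  // stand-in for mpi::Size(mpi::COMM_WORLD)
  int procs_per_model = 12;   // e.g. one Catalyst node, as in multi-MNIST
  if (procs_per_model == 0) {
    procs_per_model = world_size;  // 0 means one model spanning every process
  }
  // The commit's two new checks: enough processes, and an even split.
  if (procs_per_model > world_size || world_size % procs_per_model != 0) {
    std::fprintf(stderr, "invalid procs_per_model\n");
    return 1;
  }
  const int num_models = world_size / procs_per_model;  // 2 models here
  for (int rank = 0; rank < world_size; ++rank) {
    const int model_rank = rank / procs_per_model;      // which model
    const int rank_in_model = rank % procs_per_model;   // rank within it
    std::printf("world rank %2d -> model %d, rank %2d of %d models\n",
                rank, model_rank, rank_in_model, num_models);
  }
  return 0;
}

With 24 processes and procs_per_model = 12, ranks 0-11 land in model 0 and ranks 12-23 in model 1; mpi::Split then turns each group into its own model_comm.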
9 changes: 8 additions & 1 deletion src/lbann_params.cpp
@@ -38,7 +38,8 @@ lbann::TrainingParams::TrainingParams(void)
LrDecayRate(0.5), LrDecayCycles(5000),
ActivationType(activation_type::SIGMOID), DropOut(-1), Lambda(0),
DatasetRootDir("."), SaveImageDir("."), ParameterDir("."),
-SaveModel(false), LoadModel(false), Checkpoint(10), TrainFile(" "), TestFile(" "){
+SaveModel(false), LoadModel(false), Checkpoint(10), TrainFile(" "),
+TestFile(" "), SummaryDir("."), IntermodelCommMethod(0), ProcsPerModel(0) {

MaxMBCount = MaxTrainingSamples > 0 ? ceil((double)MaxTrainingSamples / (double)MBSize) : 0;
}
@@ -76,7 +77,13 @@ void lbann::TrainingParams::parse_params(void) {
SaveModel = Input("--save-model", "Save the current model", SaveModel);
LoadModel = Input("--load-model", "Load a saved model", LoadModel);
Checkpoint = Input("--checkpoint", "Number of training epochs between checkpoints", Checkpoint);
+SummaryDir = Input("--summary-dir", "Directory to write summary files", SummaryDir);

+IntermodelCommMethod = Input("--imcomm", "Type of inter-model communication",
+IntermodelCommMethod);
+ProcsPerModel = Input("--procs-per-model",
+"Number of processes per model (0 = one model)",
+ProcsPerModel);
}

lbann::PerformanceParams::PerformanceParams(void) : BlockSize(256), MaxParIOSize(0) {}
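
Because each Input() call passes the field's current value as the fallback, all three new options are optional: omitting them leaves the defaults above in place, with ProcsPerModel staying 0 (one model), IntermodelCommMethod staying 0, and summaries written to the working directory (SummaryDir defaults to ".").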
