Commit df9b3b1
Merge branch 'args' into develop
ndryden committed Jul 18, 2016
2 parents 151b0e2 + 3d3d928 commit df9b3b1
Showing 8 changed files with 58 additions and 15 deletions.
19 changes: 17 additions & 2 deletions experiments/run_lbann_dnn_multi_mnist.sh
@@ -25,6 +25,9 @@ ACT=1
LRM=1
TEST_W_TRAIN_DATA=0
LR_DECAY=0.5
+PROCS_PER_MODEL=
+SUMMARY_DIR=
+IMCOMM=

RUN="srun"

@@ -65,6 +68,7 @@ function HELP {
echo "${REV}-f${NORM} <val> --Path to the ${BOLD}datasets${NORM}. Default is ${BOLD}${ROOT_DATASET_DIR}${NORM}."
echo "${REV}-i${NORM} <val> --Sets the ${BOLD}parallel I/O limit${NORM}. Default is ${BOLD}${PARIO}${NORM}."
echo "${REV}-j${NORM} <val> --Sets the ${BOLD}learning rate decay${NORM}. Default is ${BOLD}${LR_DECAY}${NORM}."
+echo "${REV}-k${NORM} <val> --Sets the ${BOLD}number of processes per model${NORM}. 0 for one model."
echo "${REV}-l${NORM} <val> --Determines if the model is ${BOLD}loaded${NORM}. Default is ${BOLD}${LOAD_MODEL}${NORM}."
echo "${REV}-m${NORM} <val> --Sets the ${BOLD}mode${NORM}. Default is ${BOLD}${MODE}${NORM}."
echo "${REV}-n${NORM} <val> --Sets the ${BOLD}network topology${NORM}. Default is ${BOLD}${NETWORK}${NORM}."
@@ -76,12 +80,14 @@ function HELP {
echo "${REV}-t${NORM} <val> --Sets the number of ${BOLD}training samples${NORM}. Default is ${BOLD}${TRAINING_SAMPLES}${NORM}."
echo "${REV}-u${NORM} --Use the ${BOLD}Lustre filesystem${NORM} directly. Default is ${BOLD}${USE_LUSTRE_DIRECT}${NORM}."
echo "${REV}-v${NORM} <val> --Sets the number of ${BOLD}validation samples${NORM}. Default is ${BOLD}${VALIDATION_SAMPLES}${NORM}."
+echo "${REV}-w${NORM} <val> --Sets the ${BOLD}summary output directory${NORM}."
+echo "${REV}-x${NORM} <val> --Sets the type of ${BOLD}intermodel communication${NORM}."
echo "${REV}-z${NORM} <val> --Sets the ${BOLD}tasks per node${NORM}. Default is ${BOLD}${TASKS_PER_NODE}${NORM}."
echo -e "${REV}-h${NORM} --Displays this help message. No further functions are performed."\\n
exit 1
}

-while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
+while getopts ":a:b:cde:f:hi:j:k:l:m:n:o:p:q:r:s:t:uv:w:x:z:" opt; do
case $opt in
a)
ACT=$OPTARG
@@ -111,6 +117,9 @@ while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
j)
LR_DECAY=$OPTARG
;;
+k)
+PROCS_PER_MODEL="--procs-per-model $OPTARG"
+;;
l)
LOAD_MODEL=$OPTARG
;;
@@ -144,6 +153,12 @@ while getopts ":a:b:cde:f:hi:j:l:m:n:o:p:q:r:s:t:uv:z:" opt; do
v)
VALIDATION_SAMPLES=$OPTARG
;;
+w)
+SUMMARY_DIR="--summary-dir $OPTARG"
+;;
+x)
+IMCOMM="--imcomm $OPTARG"
+;;
z)
TASKS_PER_NODE=$OPTARG
;;
@@ -233,7 +248,7 @@ fi

fi

-CMD="${RUN} -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} ${BINDIR}/lbann_dnn_multi_mnist --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY} --lambda 0.1"
+CMD="${RUN} -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} ${BINDIR}/lbann_dnn_multi_mnist --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY} --lambda 0.1 ${SUMMARY_DIR} ${IMCOMM} ${PROCS_PER_MODEL}"
#CMD="${RUN} -N1 -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} --distribution=block --drop-caches=pagecache ${DIRNAME}/lbann_dnn_mnist --par-IO ${PARIO} --dataset ${ROOT_DATASET_DIR}/${DATASET_DIR}/ --max-validation-samples ${VALIDATION_SAMPLES} --profiling true --max-training-samples ${TRAINING_SAMPLES} --block-size ${BLOCK_SIZE} --output ${OUTPUT_DIR} --mode ${MODE} --num-epochs ${EPOCHS} --params ${PARAM_DIR} --save-model ${SAVE_MODEL} --load-model ${LOAD_MODEL} --mb-size ${MB_SIZE} --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --lr-decay-rate ${LR_DECAY}"
echo ${CMD}
${CMD}
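
Taken together, the new -k, -w, and -x flags let one launch of this script configure a multi-model run. A purely illustrative invocation (the flag values here are examples, not defaults from this commit):

./run_lbann_dnn_multi_mnist.sh -k 12 -w /path/to/summaries -x 1

This forwards --procs-per-model 12, --summary-dir /path/to/summaries, and --imcomm 1 to lbann_dnn_multi_mnist. Note that the integer passed to -x is cast back to lbann_callback_imcomm::comm_type inside the binary (see lbann_dnn_multi_mnist.cpp below), so which method a given value selects depends on that enum's definition.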
6 changes: 6 additions & 0 deletions include/lbann/lbann_params.hpp
@@ -102,7 +102,13 @@ namespace lbann
std::string TrainFile;
/// Test data file name
std::string TestFile;
+/// Location to write summary files.
+std::string SummaryDir;

+/// Type of intermodel communication to use, if any.
+int IntermodelCommMethod;
+/// Number of processes to use in each model (if using multiple).
+int ProcsPerModel;
};

/// Performance parameters
3 changes: 2 additions & 1 deletion model_zoo/lbann_dnn_imagenet.cpp
@@ -619,6 +619,7 @@ int main(int argc, char* argv[])
TrainingParams trainParams;
trainParams.DatasetRootDir = "/p/lscratchf/brainusr/datasets/ILSVRC2012/";
trainParams.DropOut = 0.1;
+trainParams.ProcsPerModel = 0;
trainParams.parse_params();
PerformanceParams perfParams;
perfParams.parse_params();
@@ -657,7 +658,7 @@ int main(int argc, char* argv[])
double sec_all_val = 0;

// Set up the communicator and get the grid.
-comm = new lbann_comm();
+comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
cout << "Number of models: " << comm->get_num_models() << endl;
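
Note that this driver (like lbann_dnn_mnist.cpp and lbann_dnn_nci.cpp below) initializes ProcsPerModel to 0 before parsing arguments, i.e. one model spanning all processes, so the existing single-model behavior is unchanged unless --procs-per-model is passed on the command line. Only the multi-MNIST driver defaults to an actual multi-model split.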
3 changes: 2 additions & 1 deletion model_zoo/lbann_dnn_mnist.cpp
@@ -71,6 +71,7 @@ int main(int argc, char* argv[])
trainParams.MBSize = 10;
trainParams.LearnRate = 0.0001;
trainParams.DropOut = -1.0f;
+trainParams.ProcsPerModel = 0;
PerformanceParams perfParams;
perfParams.BlockSize = 256;

@@ -84,7 +85,7 @@
SetBlocksize(perfParams.BlockSize);

// Set up the communicator and get the grid.
-lbann_comm* comm = new lbann_comm();
+lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
cout << "Number of models: " << comm->get_num_models() << endl;
17 changes: 11 additions & 6 deletions model_zoo/lbann_dnn_multi_mnist.cpp
@@ -65,6 +65,9 @@ int main(int argc, char* argv[])
trainParams.MBSize = 10;
trainParams.LearnRate = 0.0001;
trainParams.DropOut = -1.0f;
+trainParams.ProcsPerModel = 12; // Use one Catalyst node.
+trainParams.IntermodelCommMethod = static_cast<int>(
+lbann_callback_imcomm::COMPRESSED_ADAPTIVE_THRESH_QUANTIZATION);
PerformanceParams perfParams;
perfParams.BlockSize = 256;

@@ -79,10 +82,11 @@
SetBlocksize(perfParams.BlockSize);

// Set up the communicator and get the grid.
-lbann_comm* comm = new lbann_comm(12);
+lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
-cout << "Number of models: " << comm->get_num_models() << endl;
+cout << "Number of models: " << comm->get_num_models() <<
+" (" << comm->get_procs_per_model() << " procs per model)" << endl;
cout << "Grid is " << grid.Height() << " x " << grid.Width() << endl;
cout << endl;
}
@@ -151,13 +155,13 @@ int main(int argc, char* argv[])
uint fcidx2 = dnn.add(
"FullyConnected", 30, trainParams.ActivationType,
{new dropout(trainParams.DropOut)});
-dnn.add("SoftMax", 10);
+uint smidx = dnn.add("SoftMax", 10);
target_layer *target_layer = new target_layer_distributed_minibatch(
comm, (int) trainParams.MBSize, &mnist_trainset, &mnist_testset, true);
//target_layer *target_layer = new target_layer_distributed_minibatch_parallel_io(comm, parallel_io, (int) trainParams.MBSize, &mnist_trainset, &mnist_testset, true);
dnn.add(target_layer);

-lbann_summary summarizer("/p/lscratchf/dryden1", comm);
+lbann_summary summarizer(trainParams.SummaryDir, comm);
// Print out information for each epoch.
lbann_callback_print print_cb;
dnn.add_callback(&print_cb);
@@ -169,8 +173,9 @@
dnn.add_callback(&summary_cb);
// Do global inter-model updates.
lbann_callback_imcomm imcomm_cb(
-lbann_callback_imcomm::COMPRESSED_ADAPTIVE_THRESH_QUANTIZATION,
-{fcidx1, fcidx2}, &summarizer);
+static_cast<lbann_callback_imcomm::comm_type>(
+trainParams.IntermodelCommMethod),
+{fcidx1, fcidx2, smidx}, &summarizer);
dnn.add_callback(&imcomm_cb);

if (comm->am_world_master()) {
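
Two details in the multi-MNIST changes are easy to miss: the softmax layer's index (smidx) now joins fcidx1 and fcidx2 in the imcomm callback, so inter-model updates cover every learned layer, and the communication scheme is no longer hard-coded: the static_cast converts the integer --imcomm argument back into a lbann_callback_imcomm::comm_type, with compressed adaptive-threshold quantization still the default set above. The hard-coded lbann_comm(12) and /p/lscratchf summary path give way to the new parameters as well.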
3 changes: 2 additions & 1 deletion model_zoo/lbann_dnn_nci.cpp
@@ -54,6 +54,7 @@ int main(int argc, char* argv[])
trainParams.MBSize = 50;
trainParams.LearnRate = 0.0001;
trainParams.DropOut = -1.0f;
+trainParams.ProcsPerModel = 0;
PerformanceParams perfParams;
perfParams.BlockSize = 256;

@@ -74,7 +75,7 @@
const string test_data = trainParams.DatasetRootDir + trainParams.TestFile;

// Set up the communicator and get the grid.
-lbann_comm* comm = new lbann_comm();
+lbann_comm* comm = new lbann_comm(trainParams.ProcsPerModel);
Grid& grid = comm->get_model_grid();
if (comm->am_world_master()) {
cout << "Number of models: " << comm->get_num_models() << endl;
13 changes: 10 additions & 3 deletions src/lbann_comm.cpp
@@ -27,7 +27,7 @@
////////////////////////////////////////////////////////////////////////////////

#include "lbann/lbann_comm.hpp"

+#include "lbann/utils/lbann_exception.hpp"
#include "mpi.h"

using namespace std;
@@ -39,10 +39,17 @@ lbann::lbann_comm::lbann_comm(int _procs_per_model) :
procs_per_model(_procs_per_model), num_model_barriers(0),
num_intermodel_barriers(0), num_global_barriers(0), bytes_sent(0),
bytes_received(0) {
+int world_size = mpi::Size(mpi::COMM_WORLD);
if (procs_per_model == 0) {
-procs_per_model = mpi::Size(mpi::COMM_WORLD);
+procs_per_model = world_size;
}
+if (procs_per_model > world_size) {
+throw lbann_exception("lbann_comm: Not enough processes to create one model");
+}
+if (world_size % procs_per_model != 0) {
+throw lbann_exception("lbann_comm: Procs per model does not divide total number of procs");
+}
-num_models = mpi::Size(mpi::COMM_WORLD) / procs_per_model;
+num_models = world_size / procs_per_model;
model_rank = mpi::Rank(mpi::COMM_WORLD) / procs_per_model;
rank_in_model = mpi::Rank(mpi::COMM_WORLD) % procs_per_model;
mpi::Split(mpi::COMM_WORLD, model_rank, rank_in_model, model_comm);
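
The splitting arithmetic is easiest to see with concrete numbers. Here is a minimal, dependency-free C++ sketch (not LBANN code; a fixed world_size and a plain loop stand in for MPI) of how the constructor maps world ranks to models once the two new sanity checks pass:

#include <cstdio>

int main() {
  const int world_size = 24;  // stand-in for mpi::Size(mpi::COMM_WORLD)
  int procs_per_model = 12;   // e.g. one Catalyst node, as in multi-MNIST
  if (procs_per_model == 0) {
    procs_per_model = world_size;  // 0 means one model spanning every process
  }
  // The commit's two new checks: enough processes, and an even split.
  if (procs_per_model > world_size || world_size % procs_per_model != 0) {
    std::fprintf(stderr, "invalid procs_per_model\n");
    return 1;
  }
  const int num_models = world_size / procs_per_model;  // 2 models here
  for (int rank = 0; rank < world_size; ++rank) {
    const int model_rank = rank / procs_per_model;      // which model
    const int rank_in_model = rank % procs_per_model;   // rank within it
    std::printf("world rank %2d -> model %d, rank %2d of %d models\n",
                rank, model_rank, rank_in_model, num_models);
  }
  return 0;
}

With 24 processes and procs_per_model = 12, ranks 0-11 land in model 0 and ranks 12-23 in model 1; mpi::Split then turns each group into its own model_comm.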
9 changes: 8 additions & 1 deletion src/lbann_params.cpp
@@ -38,7 +38,8 @@ lbann::TrainingParams::TrainingParams(void)
LrDecayRate(0.5), LrDecayCycles(5000),
ActivationType(activation_type::SIGMOID), DropOut(-1), Lambda(0),
DatasetRootDir("."), SaveImageDir("."), ParameterDir("."),
-SaveModel(false), LoadModel(false), Checkpoint(10), TrainFile(" "), TestFile(" "){
+SaveModel(false), LoadModel(false), Checkpoint(10), TrainFile(" "),
+TestFile(" "), SummaryDir("."), IntermodelCommMethod(0), ProcsPerModel(0) {

MaxMBCount = MaxTrainingSamples > 0 ? ceil((double)MaxTrainingSamples / (double)MBSize) : 0;
}
@@ -76,7 +77,13 @@ void lbann::TrainingParams::parse_params(void) {
SaveModel = Input("--save-model", "Save the current model", SaveModel);
LoadModel = Input("--load-model", "Load a saved model", LoadModel);
Checkpoint = Input("--checkpoint", "Number of training epochs between checkpoints", Checkpoint);
+SummaryDir = Input("--summary-dir", "Directory to write summary files", SummaryDir);

+IntermodelCommMethod = Input("--imcomm", "Type of inter-model communication",
+IntermodelCommMethod);
+ProcsPerModel = Input("--procs-per-model",
+"Number of processes per model (0 = one model)",
+ProcsPerModel);
}

lbann::PerformanceParams::PerformanceParams(void) : BlockSize(256), MaxParIOSize(0) {}
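
Because each Input() call passes the field's current value as the fallback, all three new options are optional: omitting them leaves the defaults above in place, with ProcsPerModel staying 0 (one model), IntermodelCommMethod staying 0, and summaries written to the working directory (SummaryDir defaults to ".").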
