From a4a0e2206c30f395f25e9aed08f66734d323d229 Mon Sep 17 00:00:00 2001 From: Yusuke Oda Date: Thu, 5 Sep 2024 18:07:30 +0900 Subject: [PATCH] Add --force-stop-iter option (#16) * add --force-stop-iter option * Update megatron/training/training.py Co-authored-by: Kouta Nakayama --------- Co-authored-by: Kouta Nakayama --- megatron/training/arguments.py | 2 ++ megatron/training/training.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 69b740e9..51f1c794 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1688,5 +1688,7 @@ def _add_experimental_args(parser): '`transformer_block.py`, or `transformer_layer.py`') group.add_argument('--yaml-cfg', type=str, default=None, help = 'Config file to add additional arguments') + group.add_argument('--force-stop-iter', type=int, default=None, + help="Stop training process at this iteration regardless of any other configs.") return parser diff --git a/megatron/training/training.py b/megatron/training/training.py index 6fabd7f1..1fe791cf 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1150,7 +1150,7 @@ def track_e2e_metrics(): 'validation_iterations_time_msecs_avg': validation_iterations_time_msecs_avg }) - while iteration < args.train_iters: + while iteration < args.train_iters and (args.force_stop_iter is None or iteration < args.force_stop_iter): if ( # train_data_iterator is not None args.skip_train_iteration_range is not None