From 4d71396b57d28b2a00c951e40e349aef59aaac88 Mon Sep 17 00:00:00 2001
From: timhartill
Date: Wed, 31 Mar 2021 12:42:59 +1300
Subject: [PATCH 1/8] add tjh subdir with gpu_visibility.py

---
 tjh/gpu_visibility.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 tjh/gpu_visibility.py

diff --git a/tjh/gpu_visibility.py b/tjh/gpu_visibility.py
new file mode 100644
index 0000000..68b7184
--- /dev/null
+++ b/tjh/gpu_visibility.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+"""
+Methods of setting GPU visibility
+
+export CUDA_VISIBLE_DEVICES=5,6
+
+or
+
+CUDA_VISIBLE_DEVICES=5,6 python test_script.py
+
+or
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "5,6"
+
+
+"""
+
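
A quick way to verify the third (in-process) method: the environment variable must be
set before torch first initialises CUDA, so it has to run before anything imports torch.
Minimal sketch (device ids illustrative, not from the patch):

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5,6"  # must be set before torch touches CUDA

import torch
print(torch.cuda.device_count())      # 2 - only the two visible devices
print(torch.cuda.get_device_name(0))  # cuda:0 now maps to physical GPU 5
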
From e1c4b123219ed5010bc96216f3cc31226912bc29 Mon Sep 17 00:00:00 2001
From: timhartill
Date: Wed, 31 Mar 2021 13:43:13 +1300
Subject: [PATCH 2/8] add basic_tests.py

---
 tjh/basic_tests.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 tjh/basic_tests.py

diff --git a/tjh/basic_tests.py b/tjh/basic_tests.py
new file mode 100644
index 0000000..db91b87
--- /dev/null
+++ b/tjh/basic_tests.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Mar 31 13:05:08 2021
+
+@author: thar011
+"""
+
+from transformers import AutoTokenizer, T5ForConditionalGeneration
+
+model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = T5ForConditionalGeneration.from_pretrained(model_name)
+
+def run_model(input_string, **generator_args):
+    input_ids = tokenizer.encode(input_string, return_tensors="pt")
+    res = model.generate(input_ids, **generator_args)
+    return tokenizer.batch_decode(res, skip_special_tokens=True)
+
+
+run_model("which is best conductor? \\n (a) iron (b) feather")
+
+run_model("scott filled a tray with juice and put it in a freezer. the next day, scott opened the freezer. how did the juice most likely change? \\n (a) it condensed. (b) it evaporated. (c) it became a gas. (d) it became a solid.")
+
+run_model("which is best conductor? \\n (a) iron (b) feather (c) wood (d) plastic",
+          temperature=0.9, num_return_sequences=4, num_beams=20)
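
Note on the last call above: in the transformers 4.x generate API, temperature only
applies when sampling is enabled, so as written this call runs beam search (20 beams,
best 4 returned) and temperature is ignored. A sampled variant would look something
like (sketch):

run_model("which is best conductor? \\n (a) iron (b) feather (c) wood (d) plastic",
          do_sample=True, temperature=0.9, num_return_sequences=4)
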
From 0fdfb9ca14074f57518fdcb8fe2ca71df8fd36c0 Mon Sep 17 00:00:00 2001
From: timhartill
Date: Wed, 31 Mar 2021 18:29:14 +1300
Subject: [PATCH 3/8] updating bart.py

---
 bart/bart.py         | 46 ++++++++++++++++++++++++++++++++++--
 bart/unified_data.py |  4 ++--
 tjh/basic_tests.py   | 56 ++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/bart/bart.py b/bart/bart.py
index d38aa48..e320201 100644
--- a/bart/bart.py
+++ b/bart/bart.py
@@ -4,9 +4,31 @@
 from transformers import T5ForConditionalGeneration, BartForConditionalGeneration
 
 class MyBart(BartForConditionalGeneration):
+    """ TJH: adding , past_key_values=None to forward(..) takes us to next keyword error 'head_mask'
+    Original forward below replaced with new forward (and new outputs = self.model(...) below)
     def forward(self, input_ids, attention_mask=None, encoder_outputs=None,
                 decoder_input_ids=None, decoder_attention_mask=None, decoder_cached_states=None,
                 use_cache=False, is_training=False):
+    """
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        is_training=False # TJH added back as UnifiedQA specific
+    ):
 
         if is_training:
             decoder_start_token_id = self.config.decoder_start_token_id
@@ -16,15 +38,35 @@ def forward(self, input_ids, attention_mask=None, encoder_outputs=None,
         else:
             _decoder_input_ids = decoder_input_ids.clone()
 
+        # original unifiedQA definition:
+        #outputs = self.model(
+        #    input_ids,
+        #    attention_mask=attention_mask,
+        #    encoder_outputs=encoder_outputs,
+        #    decoder_input_ids=_decoder_input_ids,
+        #    decoder_attention_mask=decoder_attention_mask,
+        #    decoder_cached_states=decoder_cached_states,
+        #    use_cache=use_cache
+        #)
+
+        # below from modeling_bart.py
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
             encoder_outputs=encoder_outputs,
-            decoder_input_ids=_decoder_input_ids,
             decoder_attention_mask=decoder_attention_mask,
-            decoder_cached_states=decoder_cached_states,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
             use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
         )
+
         lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
         if is_training:
             loss_fct = nn.CrossEntropyLoss(reduce=False)
diff --git a/bart/unified_data.py b/bart/unified_data.py
index c5f8a35..b2c2f20 100644
--- a/bart/unified_data.py
+++ b/bart/unified_data.py
@@ -23,8 +23,8 @@ def __init__(self, logger, args, data_path, is_training):
                           "boolq",
                           "race_string",
                           "openbookqa"]
-        self.data_path = data_path
-        self.data_type = data_path.split("/")[-1][:-4]
+        self.data_path = data_path #TJH this should be ../unifiedqa/train.tsv
+        self.data_type = data_path.split("/")[-1][:-4] #TJH strip .tsv from filename appearing after final "/"
         assert self.data_type in ["train", "dev", "test"]
 
         if args.debug:
diff --git a/tjh/basic_tests.py b/tjh/basic_tests.py
index db91b87..3f3d7ea 100644
--- a/tjh/basic_tests.py
+++ b/tjh/basic_tests.py
@@ -4,11 +4,23 @@
 Created on Wed Mar 31 13:05:08 2021
 
 @author: thar011
+
+UnifiedQA initial tests:
+    T5 checkpoints tests
+    followed by BART tests
+
 """
 
+# UnifiedQA T5 checkpoints tests:
+
 from transformers import AutoTokenizer, T5ForConditionalGeneration
 
-model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
+# inference on models <= 3B works
+#model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
+#model_name = "allenai/unifiedqa-t5-base" # you can specify the model size here
+model_name = "allenai/unifiedqa-t5-large" # you can specify the model size here
+#model_name = "allenai/unifiedqa-t5-3b" # you can specify the model size here
+model_name = "allenai/unifiedqa-t5-11b" # you can specify the model size here
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 
@@ -18,9 +30,49 @@ def run_model(input_string, **generator_args):
     return tokenizer.batch_decode(res, skip_special_tokens=True)
 
 
-run_model("which is best conductor? \\n (a) iron (b) feather")
+run_model("which is best conductor? \\n (a) iron (b) feather") #['iron']
+run_model("which is best conductor? \\n ") # ['no answer>']
+run_model("which is best conductor - iron or feather? \\n ") # ['iron']
+run_model("Name a conductor of electricity? \\n Name any conductor") # ['any conductor']
+run_model("Name a conductor of electricity? \\n ") # ['yes']
+run_model("Name a conductor of electricity: \\n ") # ['yes']
+run_model("What is 53 + 9521? \\n ") # ['no answer>']
+
+
 run_model("scott filled a tray with juice and put it in a freezer. the next day, scott opened the freezer. how did the juice most likely change? \\n (a) it condensed. (b) it evaporated. (c) it became a gas. (d) it became a solid.")
 
 run_model("which is best conductor? \\n (a) iron (b) feather (c) wood (d) plastic",
           temperature=0.9, num_return_sequences=4, num_beams=20)
+
+
+# BART tests (run from unifiedqa-tjh/bart directory):
+
+import torch
+from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
+from bart import MyBart
+
+base_model = "facebook/bart-large"
+#unifiedqa_path = "unifiedQA-uncased/best-model.pt" # path to the downloaded checkpoint
+unifiedqa_path = "/data/thar011/ckpts/unifiedqa-bart-large-allenai/unifiedQA-uncased/best-model.pt" # path to the downloaded checkpoint
+
+tokenizer = BartTokenizer.from_pretrained(base_model)
+model = MyBart.from_pretrained(base_model, state_dict=torch.load(unifiedqa_path))
+model.eval()
+
+# ERROR: TypeError: forward() got an unexpected keyword argument 'past_key_values'
+x = model.generate_from_string("Which is best conductor? \\n (A) iron (B) feather", tokenizer=tokenizer)
+print(x)
+
+x = model.generate_from_string("What is the sum of 3 and 5? \\n (A) 8 (B) 3 (C) 5 (D) 10", tokenizer=tokenizer)
+print(x)
+
+
+#try basic bart model (no error):
+model = BartForConditionalGeneration.from_pretrained(base_model)
+model.eval()
+run_model("which is best conductor? \\n (a) iron (b) feather") #['whichwhich is best conductor?']
+
+
+
+
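
generate_from_string is not a transformers method - it is a helper already defined on
MyBart in bart.py (its signature appears as diff context in PATCH 5). Its body is not
shown in these patches; presumably it mirrors run_model above, roughly (sketch,
hypothetical body):

def generate_from_string(self, _input, tokenizer=None, **generator_args):
    assert tokenizer is not None
    input_ids = tokenizer.encode(_input, return_tensors="pt")
    res = self.generate(input_ids, **generator_args)  # fails pre-PATCH 5: generate() passes past_key_values into forward()
    return tokenizer.batch_decode(res, skip_special_tokens=True)
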
From 749b6746cb710172e8365925823c0f0e7d51e9f3 Mon Sep 17 00:00:00 2001
From: timhartill
Date: Thu, 1 Apr 2021 18:00:59 +1300
Subject: [PATCH 4/8] updated bart.py and run.py

---
 bart/bart.py | 2 +-
 bart/run.py  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bart/bart.py b/bart/bart.py
index e320201..05d1425 100644
--- a/bart/bart.py
+++ b/bart/bart.py
@@ -53,7 +53,7 @@ def forward(
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
+            decoder_input_ids=_decoder_input_ids, #TJH added underscore
             encoder_outputs=encoder_outputs,
             decoder_attention_mask=decoder_attention_mask,
             head_mask=head_mask,
diff --git a/bart/run.py b/bart/run.py
index cde10b3..c3bc735 100644
--- a/bart/run.py
+++ b/bart/run.py
@@ -11,7 +11,7 @@ from bart import MyBart
 
 def run(args, logger):
-    tokenizer = BartTokenizer.from_pretrained("bart-large")
+    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
 
     if args.is_unifiedqa:
         dev_data = UnifiedQAData(logger, args, args.predict_file, False)
@@ -31,10 +31,10 @@ def run(args, logger):
         train_data.load_dataloader()
 
     if args.checkpoint is not None:
-        model = MyBart.from_pretrained("bart-large",
+        model = MyBart.from_pretrained("facebook/bart-large",
                                        state_dict=torch.load(args.checkpoint))
     else:
-        model = MyBart.from_pretrained("bart-large")
+        model = MyBart.from_pretrained("facebook/bart-large")
     if args.n_gpu>1:
         model = torch.nn.DataParallel(model)
     if args.n_gpu>0:
@@ -53,7 +53,7 @@ def run(args, logger):
 
     if args.do_predict:
         checkpoint = os.path.join(args.output_dir, 'best-model.pt') if args.checkpoint is None else args.checkpoint
-        model = MyBart.from_pretrained("bart-large",
+        model = MyBart.from_pretrained("facebook/bart-large",
                                        state_dict=torch.load(checkpoint))
         logger.info("Loading checkpoint from {}".format(checkpoint))
         if args.n_gpu>0:
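
For reference, a worked example of the right-shift that produces _decoder_input_ids in
the is_training branch (values illustrative):

import torch

decoder_input_ids = torch.tensor([[42, 13, 7, 1]])  # 1 = pad (illustrative ids)
decoder_start_token_id = 2                          # from the model config

_decoder_input_ids = decoder_input_ids.new_zeros(decoder_input_ids.shape)
_decoder_input_ids[..., 1:] = decoder_input_ids[..., :-1].clone()
_decoder_input_ids[..., 0] = decoder_start_token_id
print(_decoder_input_ids)  # tensor([[ 2, 42, 13,  7]]) - shifted right, start token prepended
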
From d7f744092cf2c5d3fa9ead7c613f5301176454f2 Mon Sep 17 00:00:00 2001
From: timhartill
Date: Fri, 2 Apr 2021 22:05:47 +1300
Subject: [PATCH 5/8] working bart.py and run.py

---
 bart/bart.py | 83 ++++++++++++++++++++++++++++++++++++----------------
 bart/run.py  | 20 ++++++++-----
 2 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/bart/bart.py b/bart/bart.py
index 05d1425..a594fc0 100644
--- a/bart/bart.py
+++ b/bart/bart.py
@@ -2,13 +2,34 @@
 import torch.nn.functional as F
 from torch import Tensor, nn
 from transformers import T5ForConditionalGeneration, BartForConditionalGeneration
+from transformers.modeling_outputs import Seq2SeqLMOutput #TJH added
+
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """ TJH: from modeling_bart.py NOT currently used
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
 
 class MyBart(BartForConditionalGeneration):
     """ TJH: adding , past_key_values=None to forward(..) takes us to next keyword error 'head_mask'
+
     Original forward below replaced with new forward (and new outputs = self.model(...) below)
     def forward(self, input_ids, attention_mask=None, encoder_outputs=None,
                 decoder_input_ids=None, decoder_attention_mask=None, decoder_cached_states=None,
                 use_cache=False, is_training=False):
+
+    New version assumes that for training, decoder inputs are in labels
+    and for generation, decoder inputs are in decoder_input_ids
     """
     def forward(
         self,
@@ -22,38 +43,31 @@ def forward(
         past_key_values=None,
         inputs_embeds=None,
         decoder_inputs_embeds=None,
-        labels=None,
+        labels=None, #TJH In 4.4.2 labels contains what in unifiedqa is called decoder_input_ids
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        is_training=False # TJH added back as UnifiedQA specific
     ):
+        #TJH: Added for compatibility with 4.4.2
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        if is_training:
-            decoder_start_token_id = self.config.decoder_start_token_id
-            _decoder_input_ids = decoder_input_ids.new_zeros(decoder_input_ids.shape)
-            _decoder_input_ids[..., 1:] = decoder_input_ids[..., :-1].clone()
-            _decoder_input_ids[..., 0] = decoder_start_token_id
-        else:
-            _decoder_input_ids = decoder_input_ids.clone()
-
-        # original unifiedQA definition:
-        #outputs = self.model(
-        #    input_ids,
-        #    attention_mask=attention_mask,
-        #    encoder_outputs=encoder_outputs,
-        #    decoder_input_ids=_decoder_input_ids,
-        #    decoder_attention_mask=decoder_attention_mask,
-        #    decoder_cached_states=decoder_cached_states,
-        #    use_cache=use_cache
-        #)
+        if labels is not None: #TJH added for compatibility with other 4.4.2 seq2seq models
+            if decoder_input_ids is None:
+                #TJH: how it is done in modeling_bart.py. Using the unifiedQA method instead
+#                decoder_input_ids = shift_tokens_right(
+#                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+#                )
+                decoder_start_token_id = self.config.decoder_start_token_id
+                decoder_input_ids = labels.new_zeros(labels.shape)
+                decoder_input_ids[..., 1:] = labels[..., :-1].clone()
+                decoder_input_ids[..., 0] = decoder_start_token_id
 
-        # below from modeling_bart.py
+        # TJH: below from modeling_bart.py
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
-            decoder_input_ids=_decoder_input_ids, #TJH added underscore
+            decoder_input_ids=decoder_input_ids, #TJH: no underscore
             encoder_outputs=encoder_outputs,
             decoder_attention_mask=decoder_attention_mask,
             head_mask=head_mask,
@@ -68,13 +82,30 @@ def forward(
         )
 
         lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
-        if is_training:
+
+        loss = None
+        if labels is not None: #TJH added labels is not None instead of is_training
             loss_fct = nn.CrossEntropyLoss(reduce=False)
             losses = loss_fct(lm_logits.view(-1, self.config.vocab_size),
-                              decoder_input_ids.view(-1))
+                              labels.view(-1))
             loss = torch.sum(losses * decoder_attention_mask.float().view(-1))
-            return loss
-        return (lm_logits, ) + outputs[1:]
+
+        if not return_dict: #TJH: from modeling_bart.py
+            output = (lm_logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return Seq2SeqLMOutput( #TJH: from modeling_bart.py
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
 
     def generate_from_string(self, _input, tokenizer=None, **generator_args):
         assert tokenizer is not None
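
The loss block above keeps UnifiedQA's token-masked loss: nn.CrossEntropyLoss(reduce=False)
is the legacy spelling of reduction='none', and pad positions are zeroed out via
decoder_attention_mask before summing. The same computation in isolation (sketch,
illustrative shapes):

import torch
from torch import nn

batch, seq_len, vocab = 2, 5, 50265  # illustrative sizes (50265 = bart-large vocab)
lm_logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
decoder_attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                                       [1, 1, 1, 1, 0]])  # 1 = real token, 0 = pad

loss_fct = nn.CrossEntropyLoss(reduction='none')  # modern equivalent of reduce=False
losses = loss_fct(lm_logits.view(-1, vocab), labels.view(-1))       # one loss per token
loss = torch.sum(losses * decoder_attention_mask.float().view(-1))  # pad positions contribute 0
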
diff --git a/bart/run.py b/bart/run.py
index c3bc735..76b2d84 100644
--- a/bart/run.py
+++ b/bart/run.py
@@ -11,7 +11,7 @@ from bart import MyBart
 
 def run(args, logger):
-    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large") #TJH: bart-large
 
     if args.is_unifiedqa:
         dev_data = UnifiedQAData(logger, args, args.predict_file, False)
@@ -32,9 +32,9 @@ def run(args, logger):
 
     if args.checkpoint is not None:
         model = MyBart.from_pretrained("facebook/bart-large",
-                                       state_dict=torch.load(args.checkpoint))
+                                       state_dict=torch.load(args.checkpoint)) #TJH: bart-large
     else:
-        model = MyBart.from_pretrained("facebook/bart-large")
+        model = MyBart.from_pretrained("facebook/bart-large") #TJH: bart-large
     if args.n_gpu>1:
         model = torch.nn.DataParallel(model)
     if args.n_gpu>0:
@@ -54,7 +54,7 @@ def run(args, logger):
     if args.do_predict:
         checkpoint = os.path.join(args.output_dir, 'best-model.pt') if args.checkpoint is None else args.checkpoint
         model = MyBart.from_pretrained("facebook/bart-large",
-                                       state_dict=torch.load(checkpoint))
+                                       state_dict=torch.load(checkpoint)) #TJH: bart-large
         logger.info("Loading checkpoint from {}".format(checkpoint))
         if args.n_gpu>0:
@@ -86,9 +86,13 @@ def _convert(key):
         for batch in train_data.dataloader:
             global_step += 1
             batch = [b.to(torch.device("cuda")) for b in batch]
-            loss = model(input_ids=batch[0], attention_mask=batch[1],
-                         decoder_input_ids=batch[2], decoder_attention_mask=batch[3],
-                         is_training=True)
+#            TJH: this was the original unifiedqa:
+#            loss = model(input_ids=batch[0], attention_mask=batch[1],
+#                         decoder_input_ids=batch[2], decoder_attention_mask=batch[3],
+#                         is_training=True)
+            outputs = model(input_ids=batch[0], attention_mask=batch[1],
+                            labels=batch[2], decoder_attention_mask=batch[3])
+            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] #TJH added
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu.
             if torch.isnan(loss).data:
@@ -155,7 +159,7 @@ def inference(model, dev_data, save_predictions=False):
         outputs = model.generate(input_ids=batch[0],
                                  attention_mask=batch[1],
                                  num_beams=dev_data.args.num_beams,
-                                 min_lnegth=1,
+                                 min_length=1, #TJH: was min_lnegth
                                  max_length=dev_data.args.max_output_length,
                                  early_stopping=True,)
         for input_, output in zip(batch[0], outputs):
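
Why the isinstance test in the training loop works: Seq2SeqLMOutput derives from
OrderedDict (via ModelOutput), so when return_dict resolves to True the outputs object
supports both attribute and dict access, while return_dict=False yields the legacy
tuple. Sketch (assumes model is the patched MyBart and the batch tensors are prepared
as in run.py):

outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                labels=labels, decoder_attention_mask=decoder_attention_mask,
                return_dict=True)
loss = outputs.loss             # attribute access...
loss = outputs["loss"]          # ...or dict access: Seq2SeqLMOutput behaves like a dict

outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                labels=labels, decoder_attention_mask=decoder_attention_mask,
                return_dict=False)
loss = outputs[0]               # legacy tuple: (loss, lm_logits, ...)
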
From ae7797bf67d5835a5ab793f4f183544743f42469 Mon Sep 17 00:00:00 2001
From: timhartill
Date: Fri, 2 Apr 2021 22:07:02 +1300
Subject: [PATCH 6/8] add runtrain.sh

---
 bart/rundropft.sh | 14 ++++++++++++++
 bart/runtrain.sh  | 16 ++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 bart/rundropft.sh
 create mode 100644 bart/runtrain.sh

diff --git a/bart/rundropft.sh b/bart/rundropft.sh
new file mode 100644
index 0000000..66dc574
--- /dev/null
+++ b/bart/rundropft.sh
@@ -0,0 +1,14 @@
+# training command
+# other options: --do_predict --skip_inference --debug --checkpoint ${unifiedqa_checkpoint}
+# --prefix dev_  --prefix test_  --checkpoint_step
+
+python cli.py --do_train --output_dir /data/thar011/out/unifiedqa_dropft \
+        --checkpoint /data/thar011/ckpts/unifiedqa-bart-large-allenai/unifiedQA-uncased/best-model.pt \
+        --is_unifiedqa \
+        --train_file /data/thar011/data/unifiedqa/train.tsv \
+        --predict_file /data/thar011/data/unifiedqa/dev.tsv \
+        --train_batch_size 64 \
+        --predict_batch_size 64 \
+        --append_another_bos --do_lowercase \
+        --eval_period 10000 --verbose
+
diff --git a/bart/runtrain.sh b/bart/runtrain.sh
new file mode 100644
index 0000000..6fbd38a
--- /dev/null
+++ b/bart/runtrain.sh
@@ -0,0 +1,16 @@
+# training command
+# other options: --do_predict --skip_inference --debug --checkpoint ${unifiedqa_checkpoint}
+# --prefix dev_  --prefix test_
+
+python cli.py --do_train --output_dir /data/thar011/out/unifiedqa \
+        --is_unifiedqa \
+        --checkpoint /data/thar011/ckpts/unifiedqa-bart-large-allenai/unifiedQA-uncased/best-model.pt \
+        --train_file /data/thar011/data/unifiedqa/train.tsv \
+        --predict_file /data/thar011/data/unifiedqa/dev.tsv \
+        --train_batch_size 64 \
+        --predict_batch_size 64 \
+        --append_another_bos --do_lowercase \
+        --eval_period 10000 --verbose \
+        --num_train_epochs 10000 \
+        --gradient_accumulation_steps 1
+
From c1ec42e65cc6f2bd2e66ca086c3c7c12401b73ac Mon Sep 17 00:00:00 2001
From: timhartill
Date: Sat, 3 Apr 2021 00:03:28 +1300
Subject: [PATCH 7/8] add .gitignore + more logging

---
 .gitignore       |  9 +++++++++
 bart/bart.py     |  2 +-
 bart/run.py      | 11 ++++++++++-
 bart/runtrain.sh |  5 +++--
 4 files changed, 23 insertions(+), 4 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ce36dcd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+# Ignore files
+*.pyc
+extract_features_timbackup.py
+
+# Ignore directories
+features/
+dataset/videos/
+__pycache__
+
diff --git a/bart/bart.py b/bart/bart.py
index a594fc0..28c3b59 100644
--- a/bart/bart.py
+++ b/bart/bart.py
@@ -84,7 +84,7 @@ def forward(
         lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
 
         loss = None
-        if labels is not None: #TJH added labels is not None instead of is_training
+        if labels is not None: #TJH labels is not None instead of is_training
             loss_fct = nn.CrossEntropyLoss(reduce=False)
             losses = loss_fct(lm_logits.view(-1, self.config.vocab_size),
                               labels.view(-1))
diff --git a/bart/run.py b/bart/run.py
index 76b2d84..8b6bc4a 100644
--- a/bart/run.py
+++ b/bart/run.py
@@ -32,7 +32,8 @@ def run(args, logger):
 
     if args.checkpoint is not None:
         model = MyBart.from_pretrained("facebook/bart-large",
-                                       state_dict=torch.load(args.checkpoint)) #TJH: bart-large
+                                       state_dict=torch.load(args.checkpoint)) #TJH: bart-large
+        logger.info("Loading checkpoint from {}".format(args.checkpoint)) #TJH Added
     else:
         model = MyBart.from_pretrained("facebook/bart-large") #TJH: bart-large
     if args.n_gpu>1:
@@ -83,7 +84,11 @@ def _convert(key):
 
     logger.info("Starting training!")
     for epoch in range(int(args.num_train_epochs)):
+        if args.verbose:
+            logger.info("Starting Epoch %d" % (epoch)) #TJH added
         for batch in train_data.dataloader:
+            if args.verbose and global_step % 100 == 0:
+                logger.info("Epoch %d  Global Step %d" % (epoch, global_step)) #TJH Added
             global_step += 1
             batch = [b.to(torch.device("cuda")) for b in batch]
 #            TJH: this was the original unifiedqa:
@@ -121,6 +126,8 @@ def _convert(key):
                         torch.save(model_state_dict, os.path.join(args.output_dir,
                                    "best-model-{}.pt".format(str(global_step).zfill(6))))
                 else:
+                    if args.verbose:
+                        logger.info("Step %d Starting inference.." % (global_step)) #TJH Added
                     model.eval()
                     curr_em = inference(model if args.n_gpu==1 else model.module, dev_data)
                     logger.info("Step %d Train loss %.2f %s %.2f%% on epoch=%d" % (
@@ -142,8 +149,10 @@ def _convert(key):
                         stop_training = False
                     else:
                         wait_step += 1
+                        logger.info("No improvement. Number of wait steps: %d of max wait steps: %d" % (wait_step, args.wait_step))
                         if wait_step >= args.wait_step:
                             stop_training = True
+                            logger.info("Early Stopping due to no improvement after %d wait steps!" % (wait_step)) #TJH Added
                             break
             model.train()
         if stop_training:
diff --git a/bart/runtrain.sh b/bart/runtrain.sh
index 6fbd38a..894e2df 100644
--- a/bart/runtrain.sh
+++ b/bart/runtrain.sh
@@ -2,7 +2,7 @@
 # other options: --do_predict --skip_inference --debug --checkpoint ${unifiedqa_checkpoint}
 # --prefix dev_  --prefix test_
 
-python cli.py --do_train --output_dir /data/thar011/out/unifiedqa \
+python cli.py --do_train --output_dir /data/thar011/out/unifiedqa_1gputest \
         --is_unifiedqa \
         --checkpoint /data/thar011/ckpts/unifiedqa-bart-large-allenai/unifiedQA-uncased/best-model.pt \
         --train_file /data/thar011/data/unifiedqa/train.tsv \
@@ -12,5 +12,6 @@ python cli.py --do_train --output_dir /data/thar011/out/unifiedqa \
         --append_another_bos --do_lowercase \
         --eval_period 10000 --verbose \
         --num_train_epochs 10000 \
-        --gradient_accumulation_steps 1
+        --gradient_accumulation_steps 1 \
+        --wait_step 10
 
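
The wait_step logging added above instruments plain early stopping on dev EM. The logic
in isolation (sketch; dev_scores stands in for the EM measured at each --eval_period
evaluation, values illustrative):

dev_scores = [41.2, 43.7, 43.5, 43.6, 43.1]    # EM after each periodic eval
wait_step_limit = 10                           # --wait_step 10 in runtrain.sh

best_accuracy, wait_step, stop_training = -1.0, 0, False
for curr_em in dev_scores:
    if curr_em > best_accuracy:
        best_accuracy, wait_step = curr_em, 0  # improvement: checkpoint saved, counter reset
    else:
        wait_step += 1                         # no improvement this eval
        if wait_step >= wait_step_limit:
            stop_training = True               # give up after wait_step evals without improvement
            break
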
From 7ce580578b5bcd472232784ccbd22a17e79877f2 Mon Sep 17 00:00:00 2001
From: timhartill
Date: Mon, 5 Apr 2021 17:02:00 +1200
Subject: [PATCH 8/8] updated tokenizer params in data.py unified_data.py

---
 bart/data.py         |  6 ++++--
 bart/runpredict.sh   | 18 ++++++++++++++++++
 bart/runtrain.sh     | 10 +++++-----
 bart/unified_data.py | 12 ++++++++----
 4 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 bart/runpredict.sh

diff --git a/bart/data.py b/bart/data.py
index dbecf3e..327c7b4 100644
--- a/bart/data.py
+++ b/bart/data.py
@@ -110,10 +110,12 @@ def load_dataset(self, tokenizer, do_return=False):
             questions = [" "+question for question in questions]
             answers = [" " +answer for answer in answers]
         question_input = tokenizer.batch_encode_plus(questions,
-                                                     pad_to_max_length=True,
+                                                     truncation=True, #TJH added
+                                                     padding='max_length', #TJH was pad_to_max_length=True,
                                                      max_length=self.args.max_input_length)
         answer_input = tokenizer.batch_encode_plus(answers,
-                                                   pad_to_max_length=True,
+                                                   truncation=True, #TJH added
+                                                   padding='max_length', #TJH was pad_to_max_length=True,
                                                    max_length=self.args.max_output_length)
         input_ids, attention_mask = question_input["input_ids"], question_input["attention_mask"]
         decoder_input_ids, decoder_attention_mask = answer_input["input_ids"], answer_input["attention_mask"]
diff --git a/bart/runpredict.sh b/bart/runpredict.sh
new file mode 100644
index 0000000..0eaae10
--- /dev/null
+++ b/bart/runpredict.sh
@@ -0,0 +1,18 @@
+#run predictions - picks up best_model from output_dir otherwise can specify --checkpoint
+
+python cli.py --do_predict --output_dir /data/thar011/out/unifiedqa_2gputest_from_uqackpt \
+        --predict_file /data/thar011/data/unifiedqa/drop/dev.tsv \
+        --predict_batch_size 64 \
+        --append_another_bos --do_lowercase \
+        --verbose \
+        --prefix dev_drop_
+
+
+python cli.py --do_predict --output_dir /data/thar011/out/unifiedqa_2gputest_from_uqackpt \
+        --predict_file /data/thar011/data/unifiedqa/ropes/dev.tsv \
+        --predict_batch_size 64 \
+        --append_another_bos --do_lowercase \
+        --verbose \
+        --prefix dev_ropes_
+
+
diff --git a/bart/runtrain.sh b/bart/runtrain.sh
index 894e2df..891fc80 100644
--- a/bart/runtrain.sh
+++ b/bart/runtrain.sh
@@ -1,17 +1,17 @@
 # training command
 # other options: --do_predict --skip_inference --debug --checkpoint ${unifiedqa_checkpoint}
 # --prefix dev_  --prefix test_
+# --checkpoint /data/thar011/ckpts/unifiedqa-bart-large-allenai/unifiedQA-uncased/best-model.pt \
 
-python cli.py --do_train --output_dir /data/thar011/out/unifiedqa_1gputest \
+python cli.py --do_train --output_dir /data/thar011/out/unifiedqa_2gputest_from_bart \
         --is_unifiedqa \
-        --checkpoint /data/thar011/ckpts/unifiedqa-bart-large-allenai/unifiedQA-uncased/best-model.pt \
         --train_file /data/thar011/data/unifiedqa/train.tsv \
         --predict_file /data/thar011/data/unifiedqa/dev.tsv \
-        --train_batch_size 64 \
-        --predict_batch_size 64 \
+        --train_batch_size 32 \
+        --predict_batch_size 32 \
         --append_another_bos --do_lowercase \
         --eval_period 10000 --verbose \
         --num_train_epochs 10000 \
-        --gradient_accumulation_steps 1 \
+        --gradient_accumulation_steps 2 \
         --wait_step 10
 
diff --git a/bart/unified_data.py b/bart/unified_data.py
index b2c2f20..be19d5e 100644
--- a/bart/unified_data.py
+++ b/bart/unified_data.py
@@ -23,7 +23,7 @@ def __init__(self, logger, args, data_path, is_training):
                           "boolq",
                           "race_string",
                           "openbookqa"]
-        self.data_path = data_path #TJH this should be ../unifiedqa/train.tsv
+        self.data_path = data_path #TJH this would be ../unifiedqa/train.tsv
         self.data_type = data_path.split("/")[-1][:-4] #TJH strip .tsv from filename appearing after final "/"
         assert self.data_type in ["train", "dev", "test"]
 
@@ -96,10 +96,14 @@ def load_dataset(self, tokenizer):
             questions = [" "+question for question in questions]
             answers = [" " +answer for answer in answers]
         question_input = self.tokenizer.batch_encode_plus(questions,
-                                                          pad_to_max_length=True,
-                                                          max_length=self.args.max_input_length)
+                                                          truncation=True, #TJH added
+                                                          padding='max_length', #TJH was pad_to_max_length=True,
+                                                          max_length=self.args.max_input_length)
         answer_input = self.tokenizer.batch_encode_plus(answers,
-                                                        pad_to_max_length=True)
+                                                        truncation=True, #TJH added
+                                                        padding='max_length', #TJH was pad_to_max_length=True,
+                                                        max_length=self.args.max_output_length)
+
         input_ids, attention_mask = question_input["input_ids"], question_input["attention_mask"]
         decoder_input_ids, decoder_attention_mask = answer_input["input_ids"], answer_input["attention_mask"]
         print ("Finish tokenizering...")
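
Footnote on the tokenizer change in this last patch: pad_to_max_length=True was
deprecated in transformers 4.x in favour of explicit padding and truncation arguments.
A self-contained before/after sketch (max_length illustrative):

from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
texts = ["which is best conductor? \\n (a) iron (b) feather"]

# old (pre-4.x) style, as in the original unifiedqa code:
#enc = tokenizer.batch_encode_plus(texts, pad_to_max_length=True, max_length=32)

# 4.x style, as used in this patch - same ids/mask, with truncation now explicit:
enc = tokenizer.batch_encode_plus(texts, truncation=True, padding='max_length', max_length=32)
print(len(enc["input_ids"][0]))  # 32 - padded out to max_length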