Merge branch 'main' into bn-eval
plutonium-239 authored Aug 22, 2024
2 parents 1d85817 + ea97d77 commit 1062dd2
Showing 12 changed files with 297 additions and 44 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
 *.txt
 *.csv
 !requirements.txt
+torchviz-output/
+torchview-output/
 
 # generated docs
 docs_src/_build/
18 changes: 13 additions & 5 deletions experiments/get_best_results.py
@@ -7,7 +7,7 @@
 
 import pandas as pd
 
-from experiments.util.collect_results import case_mapping
+from experiments.util.collect_results import case_inv_mapping
 
 
 def main(base_dir: str):
@@ -16,17 +16,25 @@ def main(base_dir: str):
     Args:
         base_dir (str): The base results dir
     """
-    for device, arch in product(["cuda", "cpu"], ["linear", "conv"]):
+    # Don't recognize None as NaN
+    custom_na_values = pd._libs.parsers.STR_NA_VALUES - {"None"}
+    for device, arch in product(["cuda", "cpu"], ["linear", "conv", "transformer"]):
         # usage stats
         df = None
         idx_col = ["model", "case"]
         for fname in glob(os.path.join(base_dir, f"usage_stats-{arch}-{device}-*.csv")):
             with open(fname) as f:
-                f.readline()
-                temp_df = pd.read_csv(f, index_col=idx_col)
+                # f.readline()
+                temp_df = pd.read_csv(
+                    f,
+                    index_col=idx_col,
+                    header=1,
+                    na_values=custom_na_values,
+                    keep_default_na=False,
+                )
                 df = temp_df if df is None else pd.concat([df, temp_df])
         if df is not None:
-            df = df.rename(index=case_mapping, level=1)
+            df = df.rename(index=case_inv_mapping, level=1)
             df["Memory Usage (GB)"] = df["Memory Usage (MB)"] / 1024
             df = df.drop(columns=["Memory Usage (MB)"])
             best_results = df.groupby(idx_col).min()
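Note on the read_csv change above: keep_default_na=False together with the trimmed na_values set keeps the literal case label "None" (later mapped to "All" by case_inv_mapping) from being parsed as NaN, and header=1 replaces the manual f.readline() skip. A minimal, self-contained sketch of that behavior, using a made-up two-row CSV:

import io

import pandas as pd

# Hypothetical stand-in for a usage_stats CSV: one metadata line, then the
# real header, then a row whose case label is the literal string "None".
csv = io.StringIO("run metadata\nmodel,case,Memory Usage (MB)\nresnet18,None,512\n")

custom_na_values = pd._libs.parsers.STR_NA_VALUES - {"None"}
df = pd.read_csv(
    csv,
    index_col=["model", "case"],
    header=1,                    # use the second line as the header, like f.readline() did
    na_values=custom_na_values,  # every default NA token except "None"
    keep_default_na=False,
)
print(df.index.get_level_values("case").tolist())  # ['None'], not [nan]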
2 changes: 1 addition & 1 deletion experiments/paper_demo.py
@@ -52,7 +52,7 @@
 # num_classes = 1000
 # device = "cuda"
 # architecture = "conv"
-# cases = collect_results.select_cases(['All', 'Input', 'Conv', 'Norm', 'SurgicalFirst', 'SurgicalLast'])
+# cases = collect_results.select_cases(['All', 'Input', 'Conv', 'Norm'])
 
 # ============== TRANSFORMER CONFIG ==============
 # Valid choices for models are in models.transformer_model_fns
56 changes: 49 additions & 7 deletions experiments/util/collect_results.py
@@ -27,14 +27,50 @@
     ],
 }
 
-case_mapping = {
-    "None": "All",
-    "grad_input + no_grad_conv_weights + no_grad_conv_bias + no_grad_linear_weights + no_grad_linear_bias + no_grad_norm_weights + no_grad_norm_bias": "Input",
-    "no_grad_linear_weights + no_grad_linear_bias + no_grad_norm_weights + no_grad_norm_bias": "Conv",
-    "no_grad_conv_weights + no_grad_conv_bias + no_grad_linear_weights + no_grad_linear_bias": "Norm",
+cases = {
+    "All": None,  # ALL
+    "Input": [  # INPUT
+        "grad_input",
+        "no_grad_conv_weights",
+        "no_grad_conv_bias",
+        "no_grad_linear_weights",
+        "no_grad_linear_bias",
+        "no_grad_norm_weights",
+        "no_grad_norm_bias",
+    ],
+    "Conv": [  # CONV
+        "no_grad_linear_weights",
+        "no_grad_linear_bias",
+        "no_grad_norm_weights",
+        "no_grad_norm_bias",
+    ],
+    "Linear": [  # LINEAR
+        "no_grad_conv_weights",
+        "no_grad_conv_bias",
+        "no_grad_norm_weights",
+        "no_grad_norm_bias",
+    ],
+    "Norm": [  # NORM
+        "no_grad_conv_weights",
+        "no_grad_conv_bias",
+        "no_grad_linear_weights",
+        "no_grad_linear_bias",
+    ],
 }
 
 
+def select_cases(selected: List[str]) -> List[Union[List[str], None]]:
+    """Helper function to return cases selected by their names
+
+    Args:
+        selected (List[str]): Which cases to select, strings can be keys of the cases table
+
+    Returns:
+        List[Union[List[str], None]]: Selected cases
+    """
+    return [cases[s] for s in selected]
+
+
 def make_case_str(case: Union[None, List[str]]) -> str:
     """Format case into a string
@@ -47,6 +83,9 @@ def make_case_str(case: Union[None, List[str]]) -> str:
     return "None" if case is None else " + ".join(case)
 
 
+case_inv_mapping = {make_case_str(v): k for k, v in cases.items()}
+
+
 def hyperparam_str(args: SimpleNamespace) -> str:
     """Format hyperparams into a string
@@ -172,12 +211,15 @@ def _display_run(
         """
         # print(f"{model} input ({input_channels},{input_HW},{input_HW}) {device}")
         # print('='*78)
-        s = f"{model} input ({self.batch_size},{self.input_channels},{self.input_HW},{self.input_HW}) {self.device}"
+        if self.architecture == "conv":
+            s = f"{model} input ({self.batch_size},{self.input_channels},{self.input_HW},{self.input_HW}) {self.device}"
+        elif self.architecture == "transformer":
+            s = f"{model} input ({self.batch_size},{self.input_HW},{self.input_channels}(or model hidden size)) {self.device}"
         print(s.center(78, "="))
 
         for out, case in zip(outputs, self.cases):
             print(
-                f"{strings[estimate][1]} ({case_mapping[make_case_str(case)]}): {out:.3f}{strings[estimate][0]}"
+                f"{strings[estimate][1]} ({case_inv_mapping[make_case_str(case)]}): {out:.3f}{strings[estimate][0]}"
             )
 
         # CODE ONLY APPLIES WITH OLD RUNDEMO.PY
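Taken together, the new cases table, select_cases, make_case_str, and the derived case_inv_mapping form a round trip between human-readable case names and the flag strings stored in the results CSVs. A short usage sketch (assuming the imports resolve as in this repository):

from experiments.util.collect_results import (
    case_inv_mapping,
    make_case_str,
    select_cases,
)

selected = select_cases(["All", "Conv"])
# -> [None, ["no_grad_linear_weights", "no_grad_linear_bias",
#            "no_grad_norm_weights", "no_grad_norm_bias"]]

for case in selected:
    flat = make_case_str(case)     # "None" or "a + b + ..." as stored in the CSVs
    print(case_inv_mapping[flat])  # back to "All" / "Conv"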
51 changes: 48 additions & 3 deletions experiments/util/estimate.py
@@ -38,6 +38,8 @@
     "no_grad_norm_bias",
     "grad_input",
     "no_grad_input",
+    "grad_embed_weights",
+    "no_grad_embed_weights",
 ]
 
 
@@ -62,7 +64,10 @@ def parse_case(case: Optional[List[str]]) -> Dict[str, bool]:
 
 
 def skip_case_check(args: argparse.Namespace) -> bool:
-    """Decide whether to skip the case (when case has grad_norm_* but model does not have any normalization layers)
+    """Decide whether to skip the case:
+
+    1. when case has grad_norm_* but model does not have any normalization layers
+    2. when case has no_grad_embed_weights but no grad_input: there is a backward error (no input requires_grad)
 
     Args:
         args (argparse.Namespace): args
@@ -73,12 +78,16 @@ def skip_case_check(args: argparse.Namespace) -> bool:
     invalid = False
     if args.case is None:
         return invalid
+    # 1.
     for c in ["grad_norm_bias", "grad_norm_weights"]:
         if c in args.case and args.model in models.models_without_norm:
             invalid = True
     for c in ["no_grad_norm_bias", "no_grad_norm_weights"]:
         if c not in args.case and args.model in models.models_without_norm:
             invalid = True
+    # 2.
+    if "no_grad_embed_weights" in args.case and "grad_input" not in args.case:
+        invalid = True
     if invalid:
         if args.print:
             print("-1")
@@ -226,7 +235,7 @@ def estimate_mem_savings(
         type=str,
         required=True,
         help="Which architecture to run",
-        choices=["conv", "linear"],
+        choices=["conv", "linear", "transformer", "VLM"],
     )
     parser.add_argument(
         "--estimate",
@@ -275,23 +284,59 @@
         input_shape = (args.input_channels, args.input_hw, args.input_hw)
         models.conv_input_shape = input_shape
         model_fn = models.conv_model_fns.get(args.model)
+        y_args = {"size": (batch_size,), "low": 0, "high": num_classes}
         assert (
             model_fn is not None
         ), f"Conv model name {args.model} not found, must be one of {list(models.conv_model_fns.keys())}"
     elif args.architecture == "linear":
         input_shape = [args.input_hw**2]
         models.linear_input_shape = input_shape[0]
         model_fn = models.linear_model_fns.get(args.model)
+        y_args = {"size": (batch_size,), "low": 0, "high": num_classes}
         assert (
             model_fn is not None
         ), f"Linear model name {args.model} not found, must be one of {list(models.linear_model_fns.keys())}"
+    elif args.architecture == "transformer":
+        vocab_dim = args.num_classes
+        embed_dim = args.input_channels
+        seq_len = args.input_hw
+        model_fn = models.transformer_model_fns.get(args.model)
+        if args.model in models.hf_transformers_models:
+            model_fn_orig = model_fn
+            model_fn = lambda: models.TransformersModelWrapper(  # noqa: E731
+                model_fn_orig, args.model
+            )
+            config = models.get_transformers_config(args.model)
+            # as per transformers.PretrainedConfig these 2 should be present in all models:
+            vocab_dim = config.vocab_size
+            embed_dim = config.hidden_size
+        models.transformer_input_shape = (vocab_dim, embed_dim)
+        input_shape = [seq_len, embed_dim]
+        y_args = {"size": (batch_size, seq_len), "low": 0, "high": vocab_dim}
+        assert (
+            model_fn is not None
+        ), f"Transformer model name {args.model} not found, must be one of {list(models.transformer_model_fns.keys())}"
+    elif args.architecture == "VLM":
+        # model format: `vlm!<vis_model>!<vis_model_arch>!<llm>`
+        # eg: `vlm!vit!transformer!memsave_gpt2`
+        is_vlm, vis_model, vis_model_arch, llm = args.model.split("!")
+        assert is_vlm == "vlm"
+        assert vis_model_arch in ["transformer", "conv"]
+        model_fn = lambda: models.VLM(vis_model, vis_model_arch, llm)  # noqa: E731
+        config = models.get_transformers_config(llm)
+        vocab_dim = config.vocab_size
+        embed_dim = config.hidden_size
+        seq_len = (args.input_hw // 16) ** 2
+        y_args = {"size": (batch_size, seq_len), "low": 0, "high": vocab_dim}
+        input_shape = (args.input_channels, args.input_hw, args.input_hw)
+        models.conv_input_shape = input_shape
 
     loss_fn = CrossEntropyLoss
 
     manual_seed(0)  # make deterministic
 
     x = rand(batch_size, *input_shape, device=dev)
-    y = randint(size=(batch_size,), low=0, high=num_classes, device=dev)
+    y = randint(**y_args, device=dev)
     targets = None
     if args.model in models.detection_models:
         # pred is a dictionary of losses
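The per-architecture y_args dict is what lets the single call y = randint(**y_args, device=dev) produce one class id per sample for conv/linear models and one token id per sequence position for transformer and VLM models. A standalone sketch with illustrative, made-up sizes:

from torch import randint

batch_size, seq_len, vocab_dim, num_classes = 2, 4, 50257, 1000  # illustrative values

# conv/linear: one class id per sample
y_cls = randint(**{"size": (batch_size,), "low": 0, "high": num_classes})
# transformer/VLM: one token id per sequence position
y_tok = randint(**{"size": (batch_size, seq_len), "low": 0, "high": vocab_dim})

print(y_cls.shape, y_tok.shape)  # torch.Size([2]) torch.Size([2, 4])

For the "VLM" branch, the components are packed into the --model string itself (e.g. vlm!vit!transformer!memsave_gpt2) and recovered with a single split("!").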
[Diffs for the remaining 7 changed files were not loaded.]
