Commit

Use argument-specific path to facilitate step caching, pass partition algorithm to gconstruct
thvasilo committed Dec 18, 2024
1 parent d078d0b commit 4d266e5
Showing 5 changed files with 162 additions and 50 deletions.
@@ -38,6 +38,7 @@ Full argument list of the ``gconstruct.construct_graph`` command
* **-\-add-reverse-edges**: boolean value to decide whether to add reverse edges for the given graph. Adding this argument sets it to true; otherwise, it defaults to false. It is **strongly** suggested to include this argument for graph construction, as some nodes in the original data may have no incoming edges and thus cannot update their representations by aggregating messages from their neighbors. Adding this argument helps prevent this issue.
* **-\-output-format**: the format of the constructed graph; options are ``DGL`` and ``DistDGL``. Default is ``DistDGL``. It also accepts multiple graph formats at the same time, separated by a space, for example ``--output-format "DGL DistDGL"``. The output format is explained in the :ref:`Output <gcon-output-format>` section above.
* **-\-num-parts**: an integer value that specifies the number of graph partitions to produce. This is only valid if the output format is ``DistDGL``.
* **-\-part-method**: the partition method to use during partitioning; supported values are ``metis`` and ``random`` (see the example invocation after this list).
* **-\-skip-nonexist-edges**: boolean value to decide whether to skip edges whose endpoint nodes don't exist. Default is true.
* **-\-ext-mem-workspace**: the directory where the tool can store intermediate data during graph construction. We suggest using a high-speed SSD as the external memory workspace.
* **-\-ext-mem-feat-size**: the minimum number of feature dimensions a feature must have to be stored in external memory. Default is 64.
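
For reference, a minimal sketch of an invocation that exercises the new flag (``--conf-file`` and ``--output-dir`` are standard arguments of the command that do not appear in the excerpt above; all paths and the graph name are placeholders):

    python -m graphstorm.gconstruct.construct_graph \
        --conf-file /data/config.json \
        --output-dir /data/output \
        --graph-name example-graph \
        --num-parts 4 \
        --part-method random \
        --add-reverse-edges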
1 change: 1 addition & 0 deletions python/graphstorm/gconstruct/construct_graph.py
@@ -928,6 +928,7 @@ def process_graph(args):
help="The number of graph partitions. " + \
"This is only valid if the output format is DistDGL.")
argparser.add_argument("--part-method", type=str, default='metis',
choices=['metis', 'random'],
help="The partition method. Currently, we support 'metis' and 'random'.")
argparser.add_argument("--skip-nonexist-edges", action='store_true',
help="Skip edges that whose endpoint nodes don't exist.")
32 changes: 13 additions & 19 deletions sagemaker/pipeline/create_sm_pipeline.py
@@ -24,7 +24,6 @@
from sagemaker.processing import ScriptProcessor
from sagemaker.spark.processing import PySparkProcessor
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.functions import Join
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.pipeline import Pipeline
@@ -71,17 +70,15 @@ def __init__(
)

# Build up the output prefix
# TODO: Using PIPELINE_EXECUTION_ID in the output path invalidates cached results,
# maybe have the output path be static between executions (but unique per pipeline)?
# One option might be to use a hash of the execution parameters dict and
# add that to the prefix?
# Could be passed as another parameter to the pipeline
# We use a hash of the execution parameters dict and
# add that to the prefix to have consistent intermediate paths between executions
# that share all the same parameters.
self.output_subpath = Join(
on="/",
values=[
self.output_prefix_param,
self._get_pipeline_name(args),
ExecutionVariables.PIPELINE_EXECUTION_ID,
self.execution_subpath_param,
],
)
self.train_infer_instance = (
Expand All @@ -92,9 +89,7 @@ def __init__(
self.train_infer_image = (
args.aws_config.graphstorm_pytorch_cpu_image_url
if self.args.instance_config.train_on_cpu
else
args.aws_config.graphstorm_pytorch_gpu_image_url

else args.aws_config.graphstorm_pytorch_gpu_image_url
)
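
The ``get_hash_hex()`` helper that supplies the default subpath is referenced but not shown in this diff. A minimal sketch of how such a deterministic hash over the execution-argument dict could be computed (the serialization format and hash algorithm are assumptions, not the actual implementation):

    import hashlib
    import json

    def get_hash_hex(params: dict) -> str:
        """Deterministically map a parameter dict to a hex digest."""
        # sort_keys gives an order-independent serialization, so executions
        # that share all parameters produce the same digest and therefore
        # the same output subpath, allowing cached step results to be reused.
        canonical = json.dumps(params, sort_keys=True, default=str)
        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()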

def _get_or_create_pipeline_session(
Expand Down Expand Up @@ -195,6 +190,9 @@ def _create_pipeline_parameters(self, args: PipelineArgs):
"InstanceVolumeSizeGB",
args.instance_config.volume_size_gb,
)
self.execution_subpath_param = self._create_string_parameter(
"ExecutionSubpath", args.get_hash_hex()
)
self.graphconstruct_config_param = self._create_string_parameter(
"GraphConstructConfigFile", args.graph_construction_config.config_filename
)
@@ -310,15 +308,13 @@ def _create_gconstruct_step(self, args: PipelineArgs) -> ProcessingStep:
gc_local_input_path = "/opt/ml/processing/input"
# GConstruct should always be the first step and start with the source data
gc_proc_input = ProcessingInput(
source=self.input_data_param,
destination=gc_local_input_path,
s3_input_mode='File',
source=self.input_data_param, destination=gc_local_input_path
)
gc_local_output_path = "/opt/ml/processing/output"
gc_proc_output = ProcessingOutput(
source=gc_local_output_path,
destination=gconstruct_s3_output,
output_name=self.graph_name_param,
output_name=f"{self.graph_name_param}-gconstruct",
)

gconstruct_arguments = [
@@ -332,6 +328,8 @@ def _create_gconstruct_step(self, args: PipelineArgs) -> ProcessingStep:
self.graph_name_param,
"--num-parts",
self.instance_count_param.to_string(),
"--part-method",
self.partition_algorithm_param,
]

# TODO: Make this a pipeline parameter?
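
The stable subpath matters because SageMaker Pipelines reuses a cached step result only when the step's arguments, including its S3 input and output paths, are identical across executions. A short sketch of enabling the cache on a step (this configuration is not shown in the diff; the expiry value is an arbitrary example):

    from sagemaker.workflow.steps import CacheConfig

    # With a deterministic output subpath, re-running the pipeline with the
    # same parameters yields identical step arguments, so the step cache can
    # return the previous result instead of recomputing it.
    cache_config = CacheConfig(enable_caching=True, expire_after="P30D")  # ISO 8601 duration
    # Passed as `cache_config=...` when constructing each ProcessingStep.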
@@ -360,7 +358,6 @@ def _create_gconstruct_step(self, args: PipelineArgs) -> ProcessingStep:

def _create_gsprocessing_step(self, args: PipelineArgs) -> ProcessingStep:
# Implementation for GSProcessing step
# TODO: Add volume size
pyspark_processor = PySparkProcessor(
role=args.aws_config.role,
instance_type=args.instance_config.graph_construction_instance_type,
Expand Down Expand Up @@ -400,8 +397,6 @@ def _create_gsprocessing_step(self, args: PipelineArgs) -> ProcessingStep:
gsprocessing_output,
"--do-repartition",
"True",
"--add-reverse-edges",
"True",
"--log-level",
args.task_config.log_level,
]
@@ -417,7 +412,7 @@ def _create_gsprocessing_step(self, args: PipelineArgs) -> ProcessingStep:
destination="/opt/ml/processing/input/data",
)
gsprocessing_meta_output = ProcessingOutput(
output_name="metadata",
output_name="partition-input-metadata",
destination=gsprocessing_output,
source="/opt/ml/processing/output",
)
@@ -522,7 +517,6 @@ def _create_gb_convert_step(self, args: PipelineArgs) -> ProcessingStep:
input_name="dist_graph_s3_input",
destination="/opt/ml/processing/dist_graph/",
source=self.next_step_data_input,
# GraphBolt conversion requires File mode
s3_input_mode="File",
)
],
88 changes: 74 additions & 14 deletions sagemaker/pipeline/execute_sm_pipeline.py
@@ -17,7 +17,9 @@
"""

import argparse
import os
import sys
import warnings

import boto3
import psutil
@@ -40,11 +42,16 @@ def parse_args():
required=True,
help="Name of the pipeline to execute. Required.",
)
parser.add_argument("--region", type=str, required=False,
help="AWS region. Required for SageMaker execution.")
parser.add_argument(
"--async-execution", action="store_true",
help="Run pipeline asynchronously on SageMaker, return after printing execution ARN."
"--region",
type=str,
required=False,
help="AWS region. Required for SageMaker execution.",
)
parser.add_argument(
"--async-execution",
action="store_true",
help="Run pipeline asynchronously on SageMaker, return after printing execution ARN.",
)
parser.add_argument(
"--local-execution",
Expand All @@ -60,10 +67,9 @@ def parse_args():
),
)


overrides = parser.add_argument_group(
"Pipeline overrides",
"Override default pipeline parameters at execution time.")
"Pipeline overrides", "Override default pipeline parameters at execution time."
)

# Optional override parameters
overrides.add_argument("--instance-count", type=int, help="Override instance count")
Expand All @@ -90,7 +96,9 @@ def parse_args():
help="Override partition algorithm",
)
overrides.add_argument("--graph-name", type=str, help="Override graph name")
overrides.add_argument("--num-trainers", type=int, help="Override number of trainers")
overrides.add_argument(
"--num-trainers", type=int, help="Override number of trainers"
)
overrides.add_argument(
"--use-graphbolt",
type=str,
Expand All @@ -110,6 +118,14 @@ def parse_args():
overrides.add_argument(
"--inference-model-snapshot", type=str, help="Override inference model snapshot"
)
overrides.add_argument(
"--execution-subpath",
type=str,
help=(
"Override execution subpath. "
"By default it's derived from a hash of the input arguments"
),
)

return parser.parse_args()
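
Combining the new override with the existing flags, an execution that pins its own subpath might be launched as follows (the pipeline name and region are placeholders):

    python execute_sm_pipeline.py \
        --pipeline-name my-graphstorm-pipeline \
        --region us-east-1 \
        --async-execution \
        --execution-subpath my-custom-run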

@@ -118,15 +134,16 @@ def main():
"""Execute GraphStorm SageMaker pipeline"""
args = parse_args()

pipeline_deploy_args = load_pipeline_args(
args.pipeline_args_json_file or f"{args.pipeline_name}-pipeline-args.json"
)
deploy_time_hash = pipeline_deploy_args.get_hash_hex()

if args.local_execution:
# Use local pipeline and session
pipeline_args = load_pipeline_args(
args.pipeline_args_json_file or f"{args.pipeline_name}-pipeline-args.json"
)

local_session = LocalPipelineSession()
pipeline_generator = GraphStormPipelineGenerator(
pipeline_args, input_session=local_session
pipeline_deploy_args, input_session=local_session
)
# Set shared memory to half the host's size, as SM does
instance_mem_mb = int(psutil.virtual_memory().total // (1024 * 1024))
Expand All @@ -135,7 +152,7 @@ def main():
}
pipeline = pipeline_generator.create_pipeline()
pipeline.sagemaker_session = local_session
pipeline.create(role_arn=pipeline_args.aws_config.role)
pipeline.create(role_arn=pipeline_deploy_args.aws_config.role)
else:
assert args.region, "Need to provide --region for remote SageMaker execution"
boto_session = boto3.Session(region_name=args.region)
@@ -147,34 +164,77 @@
execution_params = {}
if args.instance_count is not None:
execution_params["InstanceCount"] = args.instance_count
pipeline_deploy_args.instance_config.train_infer_instance_count = (
args.instance_count
)
if args.cpu_instance_type:
execution_params["CPUInstanceType"] = args.cpu_instance_type
pipeline_deploy_args.instance_config.cpu_instance_type = args.cpu_instance_type
if args.gpu_instance_type:
execution_params["GPUInstanceType"] = args.gpu_instance_type
pipeline_deploy_args.instance_config.gpu_instance_type = args.gpu_instance_type
if args.graphconstruct_instance_type:
execution_params["GraphConstructInstanceType"] = (
args.graphconstruct_instance_type
)
pipeline_deploy_args.instance_config.graph_construction_instance_type = (
args.graphconstruct_instance_type
)
if args.graphconstruct_config_file:
execution_params["GraphConstructConfigFile"] = args.graphconstruct_config_file
pipeline_deploy_args.graph_construction_config.config_filename = (
args.graphconstruct_config_file
)
if args.partition_algorithm:
execution_params["PartitionAlgorithm"] = args.partition_algorithm
pipeline_deploy_args.partition_config.partition_algorithm = (
args.partition_algorithm
)
if args.graph_name:
execution_params["GraphName"] = args.graph_name
pipeline_deploy_args.task_config.graph_name = args.graph_name
if args.num_trainers is not None:
execution_params["NumTrainers"] = args.num_trainers
pipeline_deploy_args.training_config.num_trainers = args.num_trainers
if args.use_graphbolt:
execution_params["UseGraphBolt"] = args.use_graphbolt
pipeline_deploy_args.training_config.use_graphbolt_str = args.use_graphbolt
if args.input_data:
execution_params["InputData"] = args.input_data
pipeline_deploy_args.task_config.input_data_s3 = args.input_data
if args.output_prefix:
execution_params["OutputPrefix"] = args.output_prefix
pipeline_deploy_args.task_config.output_prefix = args.output_prefix
if args.train_yaml_file:
execution_params["TrainConfigFile"] = args.train_yaml_file
pipeline_deploy_args.training_config.train_yaml_file = args.train_yaml_file
if args.inference_yaml_file:
execution_params["InferenceConfigFile"] = args.inference_yaml_file
pipeline_deploy_args.inference_config.inference_yaml_file = (
args.inference_yaml_file
)
if args.inference_model_snapshot:
execution_params["InferenceModelSnapshot"] = args.inference_model_snapshot
pipeline_deploy_args.inference_config.inference_model_snapshot = (
args.inference_model_snapshot
)
# If the user specified a subpath, use that; otherwise derive it from the execution parameters
if args.execution_subpath:
execution_params["ExecutionSubpath"] = args.execution_subpath
else:
execution_params["ExecutionSubpath"] = pipeline_deploy_args.get_hash_hex()

if pipeline_deploy_args.get_hash_hex() != deploy_time_hash:
new_prefix = os.path.join(
pipeline_deploy_args.task_config.output_prefix,
args.pipeline_name,
pipeline_deploy_args.get_hash_hex(),
)
warnings.warn(
"The pipeline execution arguments have been modified "
"compared to the deployment parameters. "
f"This execution will use a new unique output prefix, : {new_prefix}."
)

# If no parameters are provided, use an empty dict to use all defaults
execution = pipeline.start(