Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ci logs #3265

Draft
wants to merge 21 commits into
base: master
Choose a base branch
from
Draft

Ci logs #3265

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/benchmark_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ name: Benchmark torchserve nightly

on:
# run every day at 2:15am
schedule:
- cron: '15 02 * * *'
# schedule:
# - cron: '15 02 * * *'
push:
branches:
- "ci_logs"

jobs:
nightly:
strategy:
fail-fast: false
matrix:
hardware: [cpu, gpu, inf2]
hardware: [gpu] #[cpu, gpu, inf2]
runs-on:
- self-hosted
- ${{ matrix.hardware }}
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/benchmark_torch_compile_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ name: Benchmark torch.compile models nightly
on:
# run every day at 2:15am
schedule:
- cron: '15 21 * * *'
- cron: '15 02 * * *'
# push:
# branches:
# - "ci_logs"

jobs:
nightly:
Expand Down
27 changes: 21 additions & 6 deletions benchmarks/auto_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config"
BENCHMARK_TMP_PATH = "/tmp/benchmark"
BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark"
BENCHMARK_REPORT_PATH_TEST = "/tmp/ts_benchmark/fail"
TS_LOGS_PATH = CWD + "/logs"
MODEL_STORE = "/tmp/model_store"
WF_STORE = "/tmp/wf_store"
Expand Down Expand Up @@ -136,8 +137,8 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly):
return

# git checkout branch if it is needed
cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase"
execute(cmd, wait=True)
# cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase"
# execute(cmd, wait=True)
print("successfully reset git")

ts_install_cmd = None
Expand All @@ -159,6 +160,7 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly):
if nightly:
cmd += " --nightly_torch"
execute(cmd, wait=True)

print("successfully install install_dependencies.py")

# install torchserve
Expand Down Expand Up @@ -212,10 +214,23 @@ def run_benchmark(bm_config):
# generate stats metrics from ab_report.csv
bm_model = model_json_config[0 : -len(".json")]

gen_metrics_json.gen_metric(
"{}/ab_report.csv".format(BENCHMARK_TMP_PATH),
"{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH),
)
try:
gen_metrics_json.gen_metric(
"{}/ab_report.csv".format(BENCHMARK_TMP_PATH),
"{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH),
)
except Exception as e:
bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH_TEST, bm_model)
os.makedirs(bm_model_log_path, exist_ok=True)

cmd = "tar -cvzf {}/logs.tar.gz {}".format(
bm_model_log_path, TS_LOGS_PATH
)
execute(cmd, wait=True)

print(f"An error occurred: {e}")
if "report_cmd" in bm_config:
execute(bm_config["report_cmd"], wait=True)

# load stats metrics to remote metrics storage
if "metrics_cmd" in bm_config:
Expand Down
25 changes: 13 additions & 12 deletions benchmarks/benchmark_config_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
# - nightly: "2022.3.16"
# - release: "0.5.3"
# Nightly build will be installed if "ts_version" is not specified
# ts_version:
# branch: &ts_version "master"
ts_version:
branch: &ts_version "ci_logs"


# a list of model configure yaml files defined in benchmarks/models_config
# or a list of model configure yaml files with full path
models:
- "bert_multi_gpu.yaml"
- "bert_multi_gpu_better_transformer.yaml"
- "bert_multi_gpu_no_better_transformer.yaml"
- "fastrcnn.yaml"
- "mnist.yaml"
- "vgg16.yaml"
# - "bert_multi_gpu_no_better_transformer.yaml"
# - "fastrcnn.yaml"
# - "mnist.yaml"
# - "vgg16.yaml"
# - "wf_dog_breed.yaml"

# benchmark on "cpu" or "gpu".
Expand All @@ -28,11 +29,11 @@ hardware: &hardware "gpu"
# - keep the values in the same order as the command definition.
# - set up the command before enabling `metrics_cmd`.
# For example, aws client and AWS credentials need to be setup before trying this example.
metrics_cmd:
- "cmd": "aws cloudwatch put-metric-data"
- "--namespace": ["torchserve_benchmark_nightly_", *hardware]
- "--region": "us-east-2"
- "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
# metrics_cmd:
# - "cmd": "aws cloudwatch put-metric-data"
# - "--namespace": ["torchserve_benchmark_nightly_", *hardware]
# - "--region": "us-east-2"
# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'

# load report to remote storage or local different path if "report_cmd" is set.
# the command line to load report to remote storage.
Expand All @@ -48,4 +49,4 @@ metrics_cmd:
report_cmd:
- "cmd": "aws s3 cp --recursive"
- "source": '/tmp/ts_benchmark/'
- "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware]
- "dest": ['s3://torchserve-benchmark/nightly', "today()", "test", *hardware]
18 changes: 9 additions & 9 deletions benchmarks/benchmark_config_torch_compile_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
# - nightly: "2022.3.16"
# - release: "0.5.3"
# Nightly build will be installed if "ts_version" is not specified
#ts_version:
# branch: &ts_version "master"
ts_version:
branch: &ts_version "ci_logs"

# a list of model configure yaml files defined in benchmarks/models_config
# or a list of model configure yaml files with full path
models:
- "bert_torch_compile_gpu.yaml"
- "resnet50_torch_compile_gpu.yaml"
- "sam_fast_torch_compile_gpu_best_latency.yaml"
# - "resnet50_torch_compile_gpu.yaml"
# - "sam_fast_torch_compile_gpu_best_latency.yaml"

# benchmark on "cpu" or "gpu".
# "cpu" is set if "hardware" is not specified
Expand All @@ -24,11 +24,11 @@ hardware: &hardware "gpu"
# - keep the values in the same order as the command definition.
# - set up the command before enabling `metrics_cmd`.
# For example, aws client and AWS credentials need to be setup before trying this example.
metrics_cmd:
- "cmd": "aws cloudwatch put-metric-data"
- "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware]
- "--region": "us-east-2"
- "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
# metrics_cmd:
# - "cmd": "aws cloudwatch put-metric-data"
# - "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware]
# - "--region": "us-east-2"
# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'

# load report to remote storage or local different path if "report_cmd" is set.
# the command line to load report to remote storage.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
bert_bt:
eager_mode:
benchmark_engine: "ab"
url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar
# url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar
url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-ERROR.mar
workers:
- 4
batch_delay: 100
Expand Down
1 change: 1 addition & 0 deletions ts_scripts/install_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def install_python_packages(self, cuda_version, requirements_file_path, nightly)
os.system(
f"pip3 install numpy --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/{pt_nightly}"
)

elif args.skip_torch_install:
print("Skipping Torch installation")
else:
Expand Down
Loading