From 87c9823703d3cfae4383afd3920aedd3a5328543 Mon Sep 17 00:00:00 2001
From: udaij12 <32673964+udaij12@users.noreply.github.com>
Date: Tue, 10 Sep 2024 11:32:54 -0700
Subject: [PATCH] Adding Graviton Regression test CI (#3273)

* testing on graviton

* testing on graviton

* testing on graviton

* checking python

* rmv python

* changing back to python

* testing cpu instead

* adding torchtext

* adding torchtext

* testing torchtext

* removing two tests

* removing pytorch test

* adding numpy upgrade

* adding numpy upgrade

* testing full ci

* testing full ci

* testing full ci

* skipping grpc

* adding graviton ci

* adding graviton ci

* adding ci cpu graviton

* adding ci cpu graviton

* adding env

* skipping a test for now

* fixing env variable

* removing scripted 3&4

* small changes

* fixing lint

* fixing lint

* fixing lint

* removing torchtext

---------

Co-authored-by: Ubuntu <ubuntu@ip-172-31-26-170.us-west-2.compute.internal>
Co-authored-by: Ankith Gunapal <agunapal@ischool.Berkeley.edu>
---
 .github/workflows/ci_graviton_cpu.yml         | 48 ++++++++++++++++++
 .../regression_tests_graviton_cpu.yml         | 41 ++++++++++++++++
 test/pytest/test_gRPC_inference_api.py        |  5 ++
 test/pytest/test_model_custom_dependencies.py |  5 ++
 test/pytest/test_pytorch_profiler.py          | 49 ++++++++++++++++---
 .../unit_tests/test_object_detector.py        |  5 +-
 6 files changed, 144 insertions(+), 9 deletions(-)
 create mode 100644 .github/workflows/ci_graviton_cpu.yml
 create mode 100644 .github/workflows/regression_tests_graviton_cpu.yml

diff --git a/.github/workflows/ci_graviton_cpu.yml b/.github/workflows/ci_graviton_cpu.yml
new file mode 100644
index 0000000000..e06072ca01
--- /dev/null
+++ b/.github/workflows/ci_graviton_cpu.yml
@@ -0,0 +1,48 @@
+name: CI CPU Graviton
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+  merge_group:
+
+
+concurrency:
+  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ci-cpu:
+    runs-on: [self-hosted, graviton-test]
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          architecture: arm64
+      - name: Setup Java 17
+        uses: actions/setup-java@v3
+        with:
+          distribution: 'zulu'
+          java-version: '17'
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: Install dependencies
+        run: |
+          python ts_scripts/install_dependencies.py --environment=dev
+      - name: Torchserve Sanity
+        uses: nick-fields/retry@v3
+        env:
+          TS_MAC_ARM64_CPU_ONLY: 'True'
+        with:
+          timeout_minutes: 60
+          max_attempts: 3
+          retry_on: error
+          command: |
+            python torchserve_sanity.py
diff --git a/.github/workflows/regression_tests_graviton_cpu.yml b/.github/workflows/regression_tests_graviton_cpu.yml
new file mode 100644
index 0000000000..39d3a7f600
--- /dev/null
+++ b/.github/workflows/regression_tests_graviton_cpu.yml
@@ -0,0 +1,41 @@
+name: Run Regression Tests on CPU for Graviton
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+  merge_group:
+
+concurrency:
+  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  regression-cpu:
+    runs-on: [self-hosted, graviton-test]
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          architecture: arm64
+      - name: Setup Java 17
+        uses: actions/setup-java@v3
+        with:
+          distribution: 'zulu'
+          java-version: '17'
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: Install dependencies
+        run: |
+          python ts_scripts/install_dependencies.py --environment=dev
+      - name: Torchserve Regression Tests
+        env:
+          TS_MAC_ARM64_CPU_ONLY: 'True'
+        run: |
+          python test/regression_tests.py
diff --git a/test/pytest/test_gRPC_inference_api.py b/test/pytest/test_gRPC_inference_api.py
index 65a6f717b4..21cc38b8c3 100644
--- a/test/pytest/test_gRPC_inference_api.py
+++ b/test/pytest/test_gRPC_inference_api.py
@@ -1,10 +1,12 @@
 import json
 import os
+import platform
 import threading
 from ast import literal_eval
 
 import inference_pb2
 import management_pb2
+import pytest
 import test_gRPC_utils
 import test_utils
 
@@ -50,6 +52,9 @@ def __infer(stub, model_name, model_input):
     return prediction
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64", reason="Test skipped on aarch64 architecture"
+)
 def test_inference_apis():
     with open(os.path.join(os.path.dirname(__file__), inference_data_json), "rb") as f:
         test_data = json.loads(f.read())
diff --git a/test/pytest/test_model_custom_dependencies.py b/test/pytest/test_model_custom_dependencies.py
index e633373be9..5c2e3b1a29 100644
--- a/test/pytest/test_model_custom_dependencies.py
+++ b/test/pytest/test_model_custom_dependencies.py
@@ -1,7 +1,9 @@
 import os
 import pathlib
+import platform
 import subprocess
 
+import pytest
 import requests
 import test_utils
 from model_archiver import ModelArchiver, ModelArchiverConfig
@@ -140,6 +142,9 @@ def register_model_and_make_inference_request(expect_model_load_failure=False):
         resp.raise_for_status()
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64", reason="Test skipped on aarch64 architecture"
+)
 def test_install_dependencies_to_target_directory_with_requirements():
     test_utils.torchserve_cleanup()
 
diff --git a/test/pytest/test_pytorch_profiler.py b/test/pytest/test_pytorch_profiler.py
index 877080c364..e2113fb76a 100644
--- a/test/pytest/test_pytorch_profiler.py
+++ b/test/pytest/test_pytorch_profiler.py
@@ -6,21 +6,28 @@
 import json
 import os
 import pathlib
+import platform
 import shutil
 import subprocess
+from concurrent import futures
 
 import pytest
 import requests
-
 import test_utils
-from concurrent import futures
 
 REPO_ROOT = os.path.normpath(
     os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../")
 )
-data_file_mnist = os.path.join(REPO_ROOT, "examples", "image_classifier", "mnist", "test_data", "1.png")
+data_file_mnist = os.path.join(
+    REPO_ROOT, "examples", "image_classifier", "mnist", "test_data", "1.png"
+)
 data_file_resnet = os.path.join(
-    REPO_ROOT, "examples", "image_classifier", "resnet_152_batch", "images", "kitten.jpg"
+    REPO_ROOT,
+    "examples",
+    "image_classifier",
+    "resnet_152_batch",
+    "images",
+    "kitten.jpg",
 )
 data_file_resnet_dog = os.path.join(
     REPO_ROOT, "examples", "image_classifier", "resnet_152_batch", "images", "dog.jpg"
@@ -33,6 +40,9 @@
 
 
 @pytest.fixture
+@pytest.mark.skipif(
+    platform.machine() == "aarch64", reason="Test skipped on aarch64 architecture"
+)
 def set_custom_handler(handler_name):
     """
     This method downloads resnet serialized file, creates mar file and sets up a custom handler
@@ -48,7 +58,8 @@ def set_custom_handler(handler_name):
     serialized_file = os.path.join(test_utils.MODEL_STORE, "resnet152-394f9c45.pth")
     if not os.path.exists(serialized_file):
         response = requests.get(
-            "https://download.pytorch.org/models/resnet152-394f9c45.pth", allow_redirects=True
+            "https://download.pytorch.org/models/resnet152-394f9c45.pth",
+            allow_redirects=True,
         )
         assert response.status_code == 200
         with open(serialized_file, "wb") as f:
@@ -58,10 +69,21 @@ def set_custom_handler(handler_name):
     cmd = test_utils.model_archiver_command_builder(
         model_name="resnet-152-batch",
         version="1.0",
-        model_file=os.path.join(test_utils.CODEBUILD_WD, "examples", "image_classifier", "resnet_152_batch", "model.py"),
+        model_file=os.path.join(
+            test_utils.CODEBUILD_WD,
+            "examples",
+            "image_classifier",
+            "resnet_152_batch",
+            "model.py",
+        ),
         serialized_file=serialized_file,
         handler=handler_name,
-        extra_files=os.path.join(test_utils.CODEBUILD_WD, "examples", "image_classifier", "index_to_name.json"),
+        extra_files=os.path.join(
+            test_utils.CODEBUILD_WD,
+            "examples",
+            "image_classifier",
+            "index_to_name.json",
+        ),
         force=True,
     )
     print(cmd)
@@ -94,6 +116,9 @@ def set_custom_handler(handler_name):
     "handler_name",
     [os.path.join(profiler_utils, "resnet_custom.py"), "image_classifier"],
 )
+@pytest.mark.skipif(
+    platform.machine() == "aarch64", reason="Test skipped on aarch64 architecture"
+)
 def test_profiler_default_and_custom_handler(set_custom_handler, handler_name):
     """
     Tests pytorch profiler integration with default and custom handler
@@ -112,6 +137,9 @@ def test_profiler_default_and_custom_handler(set_custom_handler, handler_name):
     "handler_name",
     [os.path.join(profiler_utils, "resnet_profiler_override.py")],
 )
+@pytest.mark.skipif(
+    platform.machine() == "aarch64", reason="Test skipped on aarch64 architecture"
+)
 def test_profiler_arguments_override(set_custom_handler, handler_name):
     """
     Tests pytorch profiler integration when user overrides the profiler arguments
@@ -133,6 +161,9 @@ def test_profiler_arguments_override(set_custom_handler, handler_name):
     "handler_name",
     [os.path.join(profiler_utils, "resnet_profiler_override.py")],
 )
+@pytest.mark.skipif(
+    platform.machine() == "aarch64", reason="Test skipped on aarch64 architecture"
+)
 def test_batch_input(set_custom_handler, handler_name):
     """
     Tests pytorch profiler integration with batch inference
@@ -146,7 +177,9 @@ def test_batch_input(set_custom_handler, handler_name):
 
     def invoke_batch_input():
         data = open(data_file_resnet, "rb")
-        response = requests.post("{}/predictions/resnet152".format(TF_INFERENCE_API), data)
+        response = requests.post(
+            "{}/predictions/resnet152".format(TF_INFERENCE_API), data
+        )
         assert response.status_code == 200
         assert "tiger_cat" in json.loads(response.content)
 
diff --git a/ts/torch_handler/unit_tests/test_object_detector.py b/ts/torch_handler/unit_tests/test_object_detector.py
index 290643f5ab..f56e27652c 100644
--- a/ts/torch_handler/unit_tests/test_object_detector.py
+++ b/ts/torch_handler/unit_tests/test_object_detector.py
@@ -5,6 +5,7 @@
 Ensures it can load and execute an example model
 """
 
+import platform
 import sys
 from pathlib import Path
 
@@ -56,7 +57,6 @@ def model_dir(tmp_path_factory, model_name):
 
 @pytest.fixture(scope="module")
 def context(model_dir, model_name):
-
     context = MockContext(
         model_name="mnist",
         model_dir=model_dir.as_posix(),
@@ -73,6 +73,9 @@ def handler(context):
     return handler
 
 
+@pytest.mark.skipif(
+    platform.machine() == "aarch64", reason="Test skipped on aarch64 architecture"
+)
 def test_handle(handler, context, image_bytes):
     test_data = [{"data": image_bytes}] * 2
     results = handler.handle(test_data, context)