Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ci logs #3265

Draft
wants to merge 21 commits into
base: master
Choose a base branch
from
Draft

Ci logs #3265

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/benchmark_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ name: Benchmark torchserve nightly

on:
# run every day at 2:15am
schedule:
- cron: '15 02 * * *'
# schedule:
# - cron: '15 02 * * *'
push:
branches:
- "ci_logs"

jobs:
nightly:
strategy:
fail-fast: false
matrix:
hardware: [cpu, gpu, inf2]
hardware: [gpu] #[cpu, gpu, inf2]
runs-on:
- self-hosted
- ${{ matrix.hardware }}
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/benchmark_torch_compile_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ name: Benchmark torch.compile models nightly
on:
# run every day at 2:15am
schedule:
- cron: '15 21 * * *'
- cron: '15 02 * * *'
# push:
# branches:
# - "ci_logs"

jobs:
nightly:
Expand Down
27 changes: 21 additions & 6 deletions benchmarks/auto_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config"
BENCHMARK_TMP_PATH = "/tmp/benchmark"
BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark"
BENCHMARK_REPORT_PATH_TEST = "/tmp/ts_benchmark/fail"
TS_LOGS_PATH = CWD + "/logs"
MODEL_STORE = "/tmp/model_store"
WF_STORE = "/tmp/wf_store"
Expand Down Expand Up @@ -136,8 +137,8 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly):
return

# git checkout branch if it is needed
cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase"
execute(cmd, wait=True)
# cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase"
# execute(cmd, wait=True)
print("successfully reset git")

ts_install_cmd = None
Expand All @@ -159,6 +160,7 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly):
if nightly:
cmd += " --nightly_torch"
execute(cmd, wait=True)

print("successfully install install_dependencies.py")

# install torchserve
Expand Down Expand Up @@ -212,10 +214,23 @@ def run_benchmark(bm_config):
# generate stats metrics from ab_report.csv
bm_model = model_json_config[0 : -len(".json")]

gen_metrics_json.gen_metric(
"{}/ab_report.csv".format(BENCHMARK_TMP_PATH),
"{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH),
)
try:
gen_metrics_json.gen_metric(
"{}/ab_report.csv".format(BENCHMARK_TMP_PATH),
"{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH),
)
except Exception as e:
bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH_TEST, bm_model)
os.makedirs(bm_model_log_path, exist_ok=True)

cmd = "tar -cvzf {}/logs.tar.gz {}".format(
bm_model_log_path, TS_LOGS_PATH
)
execute(cmd, wait=True)

print(f"An error occurred: {e}")
if "report_cmd" in bm_config:
execute(bm_config["report_cmd"], wait=True)

# load stats metrics to remote metrics storage
if "metrics_cmd" in bm_config:
Expand Down
25 changes: 13 additions & 12 deletions benchmarks/benchmark_config_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
# - nightly: "2022.3.16"
# - release: "0.5.3"
# Nightly build will be installed if "ts_version" is not specified
# ts_version:
# branch: &ts_version "master"
ts_version:
branch: &ts_version "ci_logs"


# a list of model configure yaml files defined in benchmarks/models_config
# or a list of model configure yaml files with full path
models:
- "bert_multi_gpu.yaml"
- "bert_multi_gpu_better_transformer.yaml"
- "bert_multi_gpu_no_better_transformer.yaml"
- "fastrcnn.yaml"
- "mnist.yaml"
- "vgg16.yaml"
# - "bert_multi_gpu_no_better_transformer.yaml"
# - "fastrcnn.yaml"
# - "mnist.yaml"
# - "vgg16.yaml"
# - "wf_dog_breed.yaml"

# benchmark on "cpu" or "gpu".
Expand All @@ -28,11 +29,11 @@ hardware: &hardware "gpu"
# - keep the values in the same order as the command definition.
# - set up the command before enabling `metrics_cmd`.
# For example, aws client and AWS credentials need to be setup before trying this example.
metrics_cmd:
- "cmd": "aws cloudwatch put-metric-data"
- "--namespace": ["torchserve_benchmark_nightly_", *hardware]
- "--region": "us-east-2"
- "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
# metrics_cmd:
# - "cmd": "aws cloudwatch put-metric-data"
# - "--namespace": ["torchserve_benchmark_nightly_", *hardware]
# - "--region": "us-east-2"
# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'

# load report to remote storage or local different path if "report_cmd" is set.
# the command line to load report to remote storage.
Expand All @@ -48,4 +49,4 @@ metrics_cmd:
report_cmd:
- "cmd": "aws s3 cp --recursive"
- "source": '/tmp/ts_benchmark/'
- "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware]
- "dest": ['s3://torchserve-benchmark/nightly', "today()", "test", *hardware]
18 changes: 9 additions & 9 deletions benchmarks/benchmark_config_torch_compile_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
# - nightly: "2022.3.16"
# - release: "0.5.3"
# Nightly build will be installed if "ts_version" is not specified
#ts_version:
# branch: &ts_version "master"
ts_version:
branch: &ts_version "ci_logs"

# a list of model configure yaml files defined in benchmarks/models_config
# or a list of model configure yaml files with full path
models:
- "bert_torch_compile_gpu.yaml"
- "resnet50_torch_compile_gpu.yaml"
- "sam_fast_torch_compile_gpu_best_latency.yaml"
# - "resnet50_torch_compile_gpu.yaml"
# - "sam_fast_torch_compile_gpu_best_latency.yaml"

# benchmark on "cpu" or "gpu".
# "cpu" is set if "hardware" is not specified
Expand All @@ -24,11 +24,11 @@ hardware: &hardware "gpu"
# - keep the values in the same order as the command definition.
# - set up the command before enabling `metrics_cmd`.
# For example, aws client and AWS credentials need to be setup before trying this example.
metrics_cmd:
- "cmd": "aws cloudwatch put-metric-data"
- "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware]
- "--region": "us-east-2"
- "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
# metrics_cmd:
# - "cmd": "aws cloudwatch put-metric-data"
# - "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware]
# - "--region": "us-east-2"
# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'

# load report to remote storage or local different path if "report_cmd" is set.
# the command line to load report to remote storage.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
bert_bt:
eager_mode:
benchmark_engine: "ab"
url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar
# url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar
url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-ERROR.mar
workers:
- 4
batch_delay: 100
Expand Down
1 change: 1 addition & 0 deletions ts_scripts/install_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def install_python_packages(self, cuda_version, requirements_file_path, nightly)
os.system(
f"pip3 install numpy --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/{pt_nightly}"
)

elif args.skip_torch_install:
print("Skipping Torch installation")
else:
Expand Down
Loading