From 3de4de7f5ebddc9d3306f6335d48a2b76e3b7f4f Mon Sep 17 00:00:00 2001
From: Philip Yoon
Date: Mon, 22 Jul 2024 20:38:52 -0700
Subject: [PATCH] #925: Verdi infrastructure for submit_pending_jobs worker

---
 .../modules/common/autoscaling_groups.tf     |  5 +++
 .../common/launch_template_user_data.sh.tmpl |  6 ++++
 .../modules/common/variables.tf              | 10 ++++++
 data_subscriber/submit_pending_jobs.sh       | 31 +++++++++++++++++++
 docker/hysds-io.json.submit_pending_jobs     |  6 ++++
 docker/job-spec.json.submit_pending_jobs     | 14 +++++++++
 6 files changed, 72 insertions(+)
 create mode 100755 data_subscriber/submit_pending_jobs.sh
 create mode 100644 docker/hysds-io.json.submit_pending_jobs
 create mode 100644 docker/job-spec.json.submit_pending_jobs

diff --git a/cluster_provisioning/modules/common/autoscaling_groups.tf b/cluster_provisioning/modules/common/autoscaling_groups.tf
index ce0c4622..497c50d4 100644
--- a/cluster_provisioning/modules/common/autoscaling_groups.tf
+++ b/cluster_provisioning/modules/common/autoscaling_groups.tf
@@ -64,6 +64,11 @@ resource "aws_cloudwatch_log_group" "run_cslc_download" {
   retention_in_days = var.lambda_log_retention_in_days
 }
 
+resource "aws_cloudwatch_log_group" "run_submit_pending_jobs" {
+  name = "/opera/sds/${var.project}-${var.venue}-${local.counter}/run_submit_pending_jobs.log"
+  retention_in_days = var.lambda_log_retention_in_days
+}
+
 resource "aws_cloudwatch_log_group" "run_batch_query" {
   name = "/opera/sds/${var.project}-${var.venue}-${local.counter}/run_batch_query.log"
   retention_in_days = var.lambda_log_retention_in_days
diff --git a/cluster_provisioning/modules/common/launch_template_user_data.sh.tmpl b/cluster_provisioning/modules/common/launch_template_user_data.sh.tmpl
index ae88dd89..adbc1696 100644
--- a/cluster_provisioning/modules/common/launch_template_user_data.sh.tmpl
+++ b/cluster_provisioning/modules/common/launch_template_user_data.sh.tmpl
@@ -88,6 +88,12 @@ echo '{
         "timezone": "Local",
         "timestamp_format": "%Y-%m-%d %H:%M:%S,%f"
       },
+      {
+        "file_path": "/data/work/jobs/**/run_submit_pending_jobs.log",
+        "log_group_name": "/opera/sds/${var_project}-${var_venue}-${local_counter}/run_submit_pending_jobs.log",
+        "timezone": "Local",
+        "timestamp_format": "%Y-%m-%d %H:%M:%S,%f"
+      },
       {
         "file_path": "/data/work/jobs/**/run_pcm_int.log",
         "log_group_name": "/opera/sds/${var_project}-${var_venue}-${local_counter}/run_pcm_int.log",
diff --git a/cluster_provisioning/modules/common/variables.tf b/cluster_provisioning/modules/common/variables.tf
index c5230f86..742429b7 100644
--- a/cluster_provisioning/modules/common/variables.tf
+++ b/cluster_provisioning/modules/common/variables.tf
@@ -491,6 +491,16 @@ variable "queues" {
     "total_jobs_metric" = true
     "use_private_vpc" = false
   }
+  "opera-job_worker-submit_pending_jobs" = {
+    "name" = "opera-job_worker-submit_pending_jobs"
+    "instance_type" = ["t3a.medium", "t3.medium", "t2.medium", "c6i.large", "t3a.large", "m6a.large", "c6a.large", "c5a.large", "r7i.large", "c7i.large"]
+    "root_dev_size" = 50
+    "data_dev_size" = 25
+    "min_size" = 0
+    "max_size" = 1
+    "total_jobs_metric" = false
+    "use_private_vpc" = false
+  }
   "opera-job_worker-rtc_data_download" = {
     "name" = "opera-job_worker-rtc_data_download"
     "instance_type" = ["c6in.large", "c5n.large", "m6in.large", "m5n.large"]
diff --git a/data_subscriber/submit_pending_jobs.sh b/data_subscriber/submit_pending_jobs.sh
new file mode 100755
index 00000000..e2c0e7b5
--- /dev/null
+++ b/data_subscriber/submit_pending_jobs.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+echo "args: $*"
+
+BASE_PATH=$(dirname "${BASH_SOURCE}")
+BASE_PATH=$(cd "${BASE_PATH}"; pwd)
+
+# source PGE env
+export OPERA_HOME=/home/ops/verdi/ops/opera-pcm
+export PYTHONPATH=$BASE_PATH:$OPERA_HOME:$PYTHONPATH
+export PATH=$BASE_PATH:$PATH
+export PYTHONDONTWRITEBYTECODE=1
+export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+
+source $HOME/verdi/bin/activate
+
+echo "##########################################"
+echo "Running job to submit pending jobs that are ready to be run"
+date
+
+python $OPERA_HOME/data_subscriber/submit_pending_jobs.py $* > run_submit_pending_jobs.log 2>&1
+
+if [ $? -eq 0 ]; then
+  echo "Finished running job"
+  date
+  exit 0
+else
+  echo "Failed to run submit_pending_jobs.py"
+  date
+  exit 1
+fi
diff --git a/docker/hysds-io.json.submit_pending_jobs b/docker/hysds-io.json.submit_pending_jobs
new file mode 100644
index 00000000..22bc4fd0
--- /dev/null
+++ b/docker/hysds-io.json.submit_pending_jobs
@@ -0,0 +1,6 @@
+{
+  "label": "Evaluate and submit pending jobs",
+  "submission_type":"individual",
+  "allowed_accounts": [ "ops" ],
+  "params": []
+}
diff --git a/docker/job-spec.json.submit_pending_jobs b/docker/job-spec.json.submit_pending_jobs
new file mode 100644
index 00000000..a5b92a56
--- /dev/null
+++ b/docker/job-spec.json.submit_pending_jobs
@@ -0,0 +1,14 @@
+{
+  "command":"/home/ops/verdi/ops/opera-pcm/data_subscriber/submit_pending_jobs.sh",
+  "disk_usage":"1GB",
+  "soft_time_limit": 1800,
+  "time_limit": 1860,
+  "imported_worker_files": {
+    "$HOME/.netrc": "/home/ops/.netrc",
+    "$HOME/.aws": "/home/ops/.aws",
+    "$HOME/verdi/etc/settings.yaml": "/home/ops/verdi/ops/opera-pcm/conf/settings.yaml"
+  },
+  "recommended-queues": [ "opera-job_worker-submit_pending_jobs" ],
+  "post": [ "hysds.triage.triage" ],
+  "params": []
+}
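
Usage sketch: the pieces above connect as follows. The opera-job_worker-submit_pending_jobs queue entry in variables.tf provisions an autoscaling group that keeps at most one Verdi worker up at a time (min_size 0, max_size 1); that worker pulls from the queue named in the job-spec's recommended-queues and runs the wrapper script with no params, and the CloudWatch agent entry added to launch_template_user_data.sh.tmpl ships the resulting run_submit_pending_jobs.log into the log group created in autoscaling_groups.tf. Below is a minimal, hypothetical smoke test run by hand on a Verdi worker as the ops user; the scratch job directory is a placeholder, since in production the Verdi daemon creates the job work directory and invokes the command from docker/job-spec.json.submit_pending_jobs:

    # placeholder work directory under the path watched by the CloudWatch agent
    mkdir -p /data/work/jobs/manual_smoke_test
    cd /data/work/jobs/manual_smoke_test
    # invoke the wrapper exactly as the job-spec "command" would, with no args
    /home/ops/verdi/ops/opera-pcm/data_subscriber/submit_pending_jobs.sh
    echo "exit code: $?"               # wrapper exits 0 on success, 1 on failure
    cat run_submit_pending_jobs.log    # captured stdout/stderr of submit_pending_jobs.py

If the underlying submit_pending_jobs.py fails, the wrapper exits 1 and the hysds.triage.triage post step in the job-spec triages the failed job.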