Orquesta CI #3150
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# We run orquesta integration tests as part of a separate workflow.
# Orquesta tests have a lot of race conditions which result in intermittent failures and timeouts.
# Utilizing a separate workflow allows us to re-run just this workflow / job on failure instead of
# wasting time and resources by needing to re-run all the jobs.
name: Orquesta CI

on:
  push:
    branches:
      # only on merges to master branch
      - master
      # and version branches, which only include minor versions (eg: v3.4)
      - v[0-9]+.[0-9]+
    tags:
      # also version tags, which include bugfix releases (eg: v3.4.0)
      - v[0-9]+.[0-9]+.[0-9]+
  pull_request:
    # NOTE: The activity-type filter key is "types" (plural).
    # "synchronize" is listed explicitly so that new commits pushed to an open PR still trigger
    # a run; it is part of GitHub's default set (opened, synchronize, reopened) which applies
    # when no "types" filter is given.
    types: [opened, reopened, synchronize, edited]
    branches:
      # Only for PRs targeting those branches
      - master
      - v[0-9]+.[0-9]+
  schedule:
    # run every night at midnight
    - cron: '0 0 * * *'
jobs:
  # TODO: Fix the required checks!
  # When the pre_job triggers and skips builds, it prevents merging the PR because
  # the required checks are reported as skipped instead of passed.

  # Special job which automatically cancels old runs for the same branch, prevents runs for the
  # same file set which has already passed, etc.
  pre_job:
    name: Skip Duplicate Jobs Pre Job
    runs-on: ubuntu-20.04
    outputs:
      # Exposed to dependent jobs as needs.pre_job.outputs.should_skip
      should_skip: ${{ steps.skip_check.outputs.should_skip }}
    steps:
      - id: skip_check
        uses: fkirc/skip-duplicate-actions@4c656bbdb6906310fa6213604828008bc28fe55d  # v3.3.0
        with:
          cancel_others: 'true'
          github_token: ${{ github.token }}

  # Runs the orquesta integration test suite (make ci-orquesta) once per Python version.
  integration-tests:
    needs: pre_job
    # NOTE: We always want to run job on master since we run some additional checks there (code
    # coverage, etc)
    # if: ${{ needs.pre_job.outputs.should_skip != 'true' || github.ref == 'refs/heads/master' }}
    name: '${{ matrix.name }} - Python ${{ matrix.python-version-short }}'
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        # NOTE: We need to use full Python version as part of Python deps cache key otherwise
        # setup virtualenv step will fail.
        include:
          - name: 'Integration Tests (Orquesta)'
            task: 'ci-orquesta'
            nosetests_node_total: 1
            nosetests_node_index: 0
            python-version: '3.6.13'
            python-version-short: '3.6'
          - name: 'Integration Tests (Orquesta)'
            task: 'ci-orquesta'
            nosetests_node_total: 1
            nosetests_node_index: 0
            python-version-short: '3.8'
            python-version: '3.8.10'
    services:
      mongo:
        image: mongo:4.4
        ports:
          - 27017:27017
      rabbitmq:
        image: rabbitmq:3.8-management
        options: >-
          --name rabbitmq
        ports:
          - 5671:5671/tcp  # AMQP SSL port
          - 5672:5672/tcp  # AMQP standard port
          - 15672:15672/tcp  # Management: HTTP, CLI

      # Used for the coordination backend for integration tests
      # NOTE: To speed things up, we only start redis for integration tests
      # where it's needed
      # redis:
      #   # Docker Hub image
      #   image: redis
      #   # Set health checks to wait until redis has started
      #   options: >-
      #     --name "redis"
      #     --health-cmd "redis-cli ping"
      #     --health-interval 10s
      #     --health-timeout 5s
      #     --health-retries 5
      #   ports:
      #     - 6379:6379/tcp
    env:
      TASK: '${{ matrix.task }}'
      NODE_TOTAL: '${{ matrix.nosetests_node_total }}'
      NODE_INDEX: '${{ matrix.nosetests_node_index }}'
      # We need to explicitly specify terminal width otherwise some CLI tests fail on container
      # environments where small terminal size is used.
      COLUMNS: '120'
      # CI st2.conf (with ST2_CI_USER user instead of stanley)
      ST2_CONF: 'conf/st2.ci.conf'
      # Tell StackStorm that we are indeed in CI mode, previously we hard coded a Travis specific
      # environment variable in our test code, making it a PITA when we switch CI providers.
      # Now, we simply set this environment variable here in the CI portion of our testing and
      # it avoids any CI provider type lock-in.
      ST2_CI: 'true'
      # Name of the user who is running the CI (on GitHub Actions this is 'runner')
      ST2_CI_USER: 'runner'
      # GitHub is juggling how to set vars for multiple shells. Protect our PATH assumptions.
      PATH: /home/runner/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2
      - name: Custom Environment Setup
        run: |
          ./scripts/github/setup-environment.sh
      - name: 'Set up Python (${{ matrix.python-version }})'
        uses: actions/setup-python@v2
        with:
          python-version: '${{ matrix.python-version }}'
      - name: Cache Python Dependencies
        uses: actions/cache@v2
        with:
          path: |
            ~/.cache/pip
            virtualenv
            ~/virtualenv
          # TODO: maybe make the virtualenv a partial cache to exclude st2*?
          # !virtualenv/lib/python*/site-packages/st2*
          # !virtualenv/bin/st2*
          key: ${{ runner.os }}-v4-python-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'test-requirements.txt') }}
          # Don't use alternative key as if requirements.txt has altered we
          # don't want to retrieve previous cache
          # restore-keys: |
          #   ${{ runner.os }}-v4-python-${{ matrix.python-version }}-
      - name: Cache APT Dependencies
        id: cache-apt-deps
        uses: actions/cache@v2
        with:
          path: |
            ~/apt_cache
          key: ${{ runner.os }}-apt-v7-${{ hashFiles('scripts/github/apt-packages.txt') }}
          restore-keys: |
            ${{ runner.os }}-apt-v7-
      - name: Install APT Depedencies
        env:
          # Passes the APT cache step's cache-hit output to the install script's environment
          CACHE_HIT: ${{steps.cache-apt-deps.outputs.cache-hit}}
        run: |
          # install dev dependencies for Python YAML and LDAP packages
          # https://github.com/StackStorm/st2-auth-ldap
          ./scripts/github/install-apt-packages-use-cache.sh
      - name: Install virtualenv
        run: |
          ./scripts/github/install-virtualenv.sh
      - name: Install requirements
        run: |
          ./scripts/ci/install-requirements.sh
      - name: Setup Integration Tests
        run: |
          # prep a ci-specific dev conf file that uses runner instead of stanley
          # this user is the username of the user in GitHub actions, used for SSH, etc during
          # integration tests (important)
          cp conf/st2.dev.conf "${ST2_CONF}" ; sed -i -e "s/stanley/${ST2_CI_USER}/" "${ST2_CONF}"
          sudo -E ./scripts/ci/add-itest-user-key.sh
      - name: Run Redis Service Container
        timeout-minutes: 2
        run: |
          docker run --rm --detach -p 127.0.0.1:6379:6379/tcp --name redis redis:latest
          until [ "$(docker inspect -f {{.State.Running}} redis)" == "true" ]; do sleep 0.1; done
      - name: Permissions Workaround
        run: |
          echo "$ST2_CI_REPO_PATH"
          sudo ST2_CI_REPO_PATH="${ST2_CI_REPO_PATH}" scripts/ci/permissions-workaround.sh
      - name: Print versions
        run: |
          ./scripts/ci/print-versions.sh
      - name: make
        # Upper bound for the whole retry loop below (up to 3 attempts of at most 10 minutes each)
        timeout-minutes: 31
        env:
          MAX_ATTEMPTS: 3
          RETRY_DELAY: 5
        # use: script -e -c to print colors
        run: |
          # There is a race in some orquesta integration tests so they tend to fail quite often.
          # To avoid needing to re-run whole workflow in such case, we should try to retry this
          # specific step. This saves us a bunch of time manually re-running the whole workflow.
          # TODO: Try to identify problematic tests (iirc mostly orquesta ones) and only retry /
          # re-run those.
          set +e
          for i in $(seq 1 ${MAX_ATTEMPTS}); do
            echo "Attempt: ${i}/${MAX_ATTEMPTS}"
            script -e -c "timeout 10m make ${TASK}" && exit 0
            exit_code=$?
            echo "Command failed / timed out (exit_code=${exit_code}), will retry in ${RETRY_DELAY} seconds..."
            sleep ${RETRY_DELAY}
          done
          set -e
          echo "Failed after ${MAX_ATTEMPTS} attempts, failing the job."
          exit 1
      - name: Upload StackStorm services Logs
        # if: ${{ failure() }}
        uses: actions/upload-artifact@v2
        with:
          # The artifact name includes the Python version: both matrix jobs upload logs, and
          # uploads that share an artifact name and file path overwrite each other.
          name: logs-python${{ matrix.python-version-short }}
          path: logs/
      - name: Codecov
        # NOTE: We only generate and submit coverage report for master and version branches and
        # only when the build succeeds (default on GitHub Actions, this was not the case on Travis
        # so we had to explicitly check success)
        if: "${{ success() && env.ENABLE_COVERAGE == 'yes' }}"
        run: |
          ./scripts/ci/submit-codecov-coverage.sh
      - name: Compress Service Logs Before upload
        if: ${{ failure() }}
        run: |
          tar cvzpf logs.tar.gz logs/*
      - name: Upload StackStorm services Logs
        if: ${{ failure() }}
        uses: actions/upload-artifact@v2
        with:
          # Per-Python-version artifact name so that the logs.tar.gz of one matrix job does not
          # overwrite the logs.tar.gz of the other.
          name: logs-python${{ matrix.python-version-short }}
          path: logs.tar.gz
          retention-days: 7
      - name: Stop Redis Service Container
        if: "${{ always() }}"
        run: docker rm --force redis || true

  slack-notification:
    name: Slack notification for failed master builds
    # Run even when integration-tests failed; the failure condition is checked per step below.
    if: always()
    needs:
      - integration-tests
    runs-on: ubuntu-20.04
    steps:
      - name: Workflow conclusion
        # this step creates an environment variable WORKFLOW_CONCLUSION and is the most reliable
        # way to check the status of previous jobs
        uses: technote-space/workflow-conclusion-action@v2
      - name: CI Run Failure Slack Notification
        if: ${{ env.WORKFLOW_CONCLUSION == 'failure' && github.ref == 'refs/heads/master' }}
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
        uses: voxmedia/github-action-slack-notify-build@v1
        with:
          channel: development
          status: FAILED
          color: danger

      # HELPER FOR FUTURE DEVELOPERS:
      # If your GitHub Actions job is failing and you need to debug it, by default there is
      # no way to SSH into the container.
      # The step below can be uncommented and will stop here and allow you to SSH in.
      # When this step is reached, simply refresh the GitHub Actions output for this build
      # and this SSH command will be printed every 5 seconds to the output.
      # Once you are done debugging in your SSH session, simply: touch /continue
      # and this will continue the build.
      #
      # - name: Setup tmate session for debugging failed jobs (allows SSH into the container)
      #   uses: mxschmitt/action-tmate@v3
      #   if: "${{ failure() }}"
      #