From 1e7a96007b8ed08c7df415d9445442aa148b73b9 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Mon, 21 Nov 2016 13:42:37 -0500 Subject: [PATCH 1/3] spec: add support for provisioning clusters --- sample.redhat-ci.yml | 44 ++++++++++++++++++++-------- utils/ext_schema.py | 68 ++++++++++++++++++++++++++++++++++---------- utils/schema.yml | 10 +++++++ 3 files changed, 95 insertions(+), 27 deletions(-) diff --git a/sample.redhat-ci.yml b/sample.redhat-ci.yml index 02327aa..866840b 100644 --- a/sample.redhat-ci.yml +++ b/sample.redhat-ci.yml @@ -1,11 +1,5 @@ -# The current format is very simple and may change in the -# future as more features get added, though we will try to -# maintain backwards compatibility (or help projects migrate -# to the new format). - -# REQUIRED (one of 'host' or 'container') -# All details about the host to provision go under the host -# key, though for now, 'distro' is the only handled child: +# REQUIRED (only one of 'host' or 'container' or 'cluster') +# Provision a single host. host: # REQUIRED # Specify the distro to provision. More options will be @@ -41,12 +35,36 @@ host: # omitted, the latest commit is used. revision: 7.145.42 -# REQUIRED (one of 'host' or 'container') +# REQUIRED (only one of 'host' or 'container' or 'cluster') +# Provision a container. container: # REQUIRED # Specify an FQIN or Docker Hub image. image: fedora:24 +# REQUIRED (only one of 'host' or 'container' or 'cluster') +# Provision multiple hosts. +cluster: + # REQUIRED + # List of hosts to provision. The same keys as above are + # accepted. + hosts: + # REQUIRED + # Node hostname. Also makes the environment variable + # $RHCI_{name}_IP available (with dots and dashes + # replaced by underscores). + - name: host1 + distro: centos/7/atomic + ostree: latest + - name: host2 + distro: fedora/24/cloud + # OPTIONAL + # If specified, the scripts are run on this container. + # If omitted, the scripts are run on the first host + # listed in the 'hosts' list. + container: + image: fedora:24 + # OPTIONAL # List the branches to test. If omitted, only the master # branch is tested. @@ -111,6 +129,7 @@ build: tests: - make check - make installcheck + - ansible-playbook -i host1,$RHCI_host2_IP, playbook.yml # OPTIONAL # Time to allow before aborting tests. Must satisfy regex @@ -143,9 +162,10 @@ context: 'My other testsuite' # To unset an inherited key, simply leave off its value. artifacts: -# As a convenience, specifying 'host' automatically unsets -# 'container' if it was inherited (and vice-versa), so that -# there is no need to explicitly unset it. +# As a convenience, specifying one of the 'host', +# 'container' or 'cluster' keys automatically unsets the +# other two if inherited, so that there is no need to +# explicitly unset them. host: distro: fedora/24/atomic diff --git a/utils/ext_schema.py b/utils/ext_schema.py index 3c97c24..e07dff5 100644 --- a/utils/ext_schema.py +++ b/utils/ext_schema.py @@ -1,17 +1,51 @@ +import re import utils.common as common from pykwalify.core import Core from pykwalify.errors import SchemaError + +# http://stackoverflow.com/questions/2532053/ +def _valid_hostname(hostname): + if len(hostname) > 253: + return False + if re.match(r"[\d.]+$", hostname): + return False + allowed = re.compile("(?!-)[A-Z\d-]{1,63}(? 
1: + raise SchemaError("only one of 'host', 'container', " + "or 'cluster' required") if 'build' not in value and 'tests' not in value: raise SchemaError("at least one of 'build' or 'tests' required") return True + +def ext_hosts(value, rule_obj, path): + # Until this is fixed: + # https://github.com/Grokzen/pykwalify/issues/67 + if type(value) is not list: + raise SchemaError("expected list of dicts") + for i, host in enumerate(value): + if type(host) is not dict: + raise SchemaError("host %d is not a dict" % i) + if 'name' not in host: + raise SchemaError("host %d missing key 'name'" % i) + if not _valid_hostname(host['name']): + raise SchemaError("invalid hostname for host %d" % i) + if 'ostree' in host: + ext_ostree(host['ostree'], rule_obj, path) + return True + + def ext_repos(value, rule_obj, path): # Until this is fixed: # https://github.com/Grokzen/pykwalify/issues/67 @@ -24,41 +58,45 @@ def ext_repos(value, rule_obj, path): raise SchemaError("repo %d missing key 'name'" % i) for key in repo: if type(repo[key]) not in [int, str]: - raise SchemaError("key '%s' of repo %d is not str or int" % (key, i)) + raise SchemaError("key '%s' of repo %d is not str or int" + % (key, i)) return True + def ext_ostree(value, rule_obj, path): if type(value) is str: if value != "latest": raise SchemaError("expected string 'latest'") elif type(value) is dict: - schema = { 'mapping': - { 'remote': { 'type': 'str' }, - 'branch': { 'type': 'str' }, - 'revision' : { 'type': 'str' } + schema = {'mapping': + {'remote': {'type': 'str'}, + 'branch': {'type': 'str'}, + 'revision': {'type': 'str'} } - } + } c = Core(source_data=value, schema_data=schema) c.validate() else: raise SchemaError("expected str or map") return True + def ext_timeout(value, rule_obj, path): if common.str_to_timeout(value) > (2 * 60 * 60): raise SchemaError("timeout cannot be greater than 2 hours") return True + def ext_build(value, rule_obj, path): if type(value) not in [dict, bool]: raise SchemaError("expected bool or map") if type(value) is dict: - schema = { 'mapping': - { 'config-opts': { 'type': 'str' }, - 'build-opts': { 'type': 'str' }, - 'install-opts': { 'type': 'str' } + schema = {'mapping': + {'config-opts': {'type': 'str'}, + 'build-opts': {'type': 'str'}, + 'install-opts': {'type': 'str'} } - } + } c = Core(source_data=value, schema_data=schema) c.validate() return True diff --git a/utils/schema.yml b/utils/schema.yml index 3c043e8..279df73 100644 --- a/utils/schema.yml +++ b/utils/schema.yml @@ -14,6 +14,16 @@ mapping: image: type: str required: true + cluster: + mapping: + hosts: + type: any + func: ext_hosts + container: + mapping: + image: + type: str + required: true context: type: str required: true From fc5bb1b85be8444ce53ac8968d8343b34092f184 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Mon, 21 Nov 2016 13:45:07 -0500 Subject: [PATCH 2/3] parser: parse out cluster entries --- utils/parser.py | 90 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/utils/parser.py b/utils/parser.py index cf23474..c331e18 100755 --- a/utils/parser.py +++ b/utils/parser.py @@ -5,7 +5,6 @@ # integrate pieces of the pipeline in here. E.g. # provisioning, prereqs, test runs, etc... 
-import re import os import sys import yaml @@ -95,40 +94,66 @@ def _validate(suite, contexts): contexts.append(suite['context']) -def flush_suite(suite, outdir): +def write_to_file(dir, fn, s): + with open(os.path.join(dir, fn), 'w') as f: + f.write(s) + + +def flush_host(host, outdir): + if 'ostree' in host: + val = host['ostree'] + assert type(val) in [str, dict] + if type(val) is str: + assert val == "latest" + write_to_file(outdir, "ostree_revision", "") + else: + write_to_file(outdir, "ostree_remote", val.get('remote', '')) + write_to_file(outdir, "ostree_branch", val.get('branch', '')) + write_to_file(outdir, "ostree_revision", val.get('revision', '')) + write_to_file(outdir, "distro", host['distro']) - def write_to_file(fn, s): - with open(os.path.join(outdir, fn), 'w') as f: - f.write(s) + +def flush_suite(suite, outdir): os.makedirs(outdir) if 'host' in suite: - host = suite['host'] - if 'ostree' in host: - val = host['ostree'] - assert type(val) in [str, dict] - if type(val) is str: - assert val == "latest" - write_to_file("ostree_revision", "") - else: - write_to_file("ostree_remote", val.get('remote', '')) - write_to_file("ostree_branch", val.get('branch', '')) - write_to_file("ostree_revision", val.get('revision', '')) - write_to_file("distro", host['distro']) + dir = os.path.join(outdir, "host") + os.mkdir(dir) + flush_host(suite['host'], dir) + write_to_file(outdir, 'envtype', 'host') + write_to_file(outdir, 'controller', 'host') if 'container' in suite: - write_to_file("image", suite['container']['image']) + write_to_file(outdir, "image", suite['container']['image']) + write_to_file(outdir, 'envtype', 'container') + write_to_file(outdir, 'controller', 'container') + + if 'cluster' in suite: + cluster = suite['cluster'] + for i, host in enumerate(cluster['hosts']): + dir = os.path.join(outdir, "host-%d" % i) + os.mkdir(dir) + flush_host(host, dir) + write_to_file(dir, "name", host['name']) + write_to_file(outdir, 'nhosts', str(i+1)) + if 'container' in cluster: + write_to_file(outdir, "image", cluster['container']['image']) + write_to_file(outdir, 'controller', 'container') + else: + write_to_file(outdir, 'controller', 'host') + write_to_file(outdir, 'envtype', 'cluster') if 'tests' in suite: - write_to_file("tests", '\n'.join(suite['tests'])) + write_to_file(outdir, "tests", '\n'.join(suite['tests'])) - write_to_file("branches", '\n'.join(suite.get('branches', ['master']))) + write_to_file(outdir, "branches", + '\n'.join(suite.get('branches', ['master']))) timeout = common.str_to_timeout(suite.get('timeout', '2h')) - write_to_file("timeout", str(timeout)) + write_to_file(outdir, "timeout", str(timeout)) - write_to_file("context", suite.get('context')) + write_to_file(outdir, "context", suite.get('context')) if 'extra-repos' in suite: repos = '' @@ -137,32 +162,35 @@ def write_to_file(fn, s): for key, val in repo.items(): repos += "%s=%s\n" % (key, val) if repos != "": - write_to_file("rhci-extras.repo", repos) + write_to_file(outdir, "rhci-extras.repo", repos) if 'packages' in suite: packages = [] for pkg in suite['packages']: packages.append(shlex.quote(pkg)) - write_to_file("packages", ' '.join(packages)) + write_to_file(outdir, "packages", ' '.join(packages)) if 'artifacts' in suite: - write_to_file("artifacts", '\n'.join(suite['artifacts'])) + write_to_file(outdir, "artifacts", '\n'.join(suite['artifacts'])) if 'env' in suite: envs = '' for k, v in suite['env'].items(): envs += 'export %s=%s\n' % (k, shlex.quote(v)) - write_to_file("envs", envs) + write_to_file(outdir, 
"envs", envs) if 'build' in suite: v = suite['build'] if type(v) is bool and v: - write_to_file("build", '') + write_to_file(outdir, "build", '') elif type(v) is dict: - write_to_file("build", '') - write_to_file("build.config_opts", v.get('config-opts', '')) - write_to_file("build.build_opts", v.get('build-opts', '')) - write_to_file("build.install_opts", v.get('install-opts', '')) + write_to_file(outdir, "build", '') + write_to_file(outdir, "build.config_opts", + v.get('config-opts', '')) + write_to_file(outdir, "build.build_opts", + v.get('build-opts', '')) + write_to_file(outdir, "build.install_opts", + v.get('install-opts', '')) if __name__ == '__main__': From 640d6e3e3ce21a0ae511776cd6a3b2bc2075af72 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Mon, 21 Nov 2016 13:46:03 -0500 Subject: [PATCH 3/3] testrunner: implement cluster provisioning support We break out the provisioning part into its own script so that we can more easily provision multiple hosts simultaneously. We also make sure that SSH is all set up so that users can just directly start SSH'ing between nodes. --- Dockerfile | 4 - main | 2 + provisioner | 133 +++++++++++++++++ spawner.py | 1 + testrunner | 339 +++++++++++++++++++++++------------------- utils/common.py | 2 + utils/common.sh | 46 ++++++ utils/os_provision.py | 31 ++-- utils/user-data | 1 - 9 files changed, 389 insertions(+), 170 deletions(-) create mode 100755 provisioner diff --git a/Dockerfile b/Dockerfile index 4ec6249..2034a6f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,10 +50,6 @@ LABEL RUN="/usr/bin/docker run --rm --privileged \ \${OPT1} \ \${IMAGE}" -# When run in e.g. Jenkins, it's really annoying to not see -# any output of e.g. the provisioner until it's all done. -ENV PYTHONUNBUFFERED 1 - COPY . /redhat-ci RUN pip3 install -r /redhat-ci/requirements.txt diff --git a/main b/main index 75eccec..6662120 100755 --- a/main +++ b/main @@ -38,6 +38,8 @@ main() { exit 0 fi + export PYTHONUNBUFFERED=1 + exec $THIS_DIR/spawner.py } diff --git a/provisioner b/provisioner new file mode 100755 index 0000000..eec0ed4 --- /dev/null +++ b/provisioner @@ -0,0 +1,133 @@ +#!/bin/bash +set -Exeuo pipefail + +# This script provisions a node on OpenStack. It may be +# called multiple times in parallel. + +THIS_DIR=$(dirname $0) + +source $THIS_DIR/utils/common.sh + +main() { + + # NB: see the various NBs in the main() of main. + + state=$1; shift + parsedhost=$1; shift + outdir=$1; shift + + [ -d $parsedhost ] + mkdir $outdir + + provision_host +} + +provision_host() { + + # XXX: We hardcode m1.small for now, but these really + # should be specified indirectly from the .redhat-ci + # YAML file through e.g. min-* vars. + env \ + os_image="$(cat $parsedhost/distro)" \ + os_flavor=m1.small \ + os_name_prefix=github-ci-testnode \ + os_user_data="$THIS_DIR/utils/user-data" \ + "$THIS_DIR/utils/os_provision.py" $outdir + + ssh_wait $(cat $outdir/node_addr) $state/node_key + + if [ -f $parsedhost/ostree_revision ]; then + if ! on_atomic_host; then + update_github error "Cannot specify 'ostree' on non-AH." + touch $state/exit # signal testrunner to exit nicely + exit 0 + fi + deploy_ostree + fi +} + +deploy_ostree() { + local remote=$(cat $parsedhost/ostree_remote) + local branch=$(cat $parsedhost/ostree_branch) + local revision=$(cat $parsedhost/ostree_revision) + + local rc=0 + local skip_reboot=0 + if [ -z "$remote" ] && [ -z "$branch" ]; then + + if [ -z "$revision" ]; then + vmssh rpm-ostree upgrade --upgrade-unchanged-exit-77 || rc=$? 
+        else
+            vmssh rpm-ostree deploy "$revision" || rc=$?
+        fi
+
+        if [ $rc == 77 ]; then
+            skip_reboot=1
+        elif [ $rc != 0 ]; then
+            update_github error "Failed to upgrade or deploy."
+            touch $state/exit # signal testrunner to exit nicely
+            exit 0
+        fi
+    else
+        local refspec
+
+        if [ -n "$remote" ]; then
+            vmssh ostree remote add --no-gpg-verify rhci "$remote"
+            refspec=rhci:
+        fi
+
+        if [ -n "$branch" ]; then
+            refspec="${refspec}$branch"
+        fi
+
+        if ! vmssh rpm-ostree rebase "$refspec"; then
+            update_github error "Failed to rebase onto refspec."
+            touch $state/exit # signal testrunner to exit nicely
+            exit 0
+        fi
+
+        if [ -n "$revision" ]; then
+            # we should really be able to do this in a single step
+            # https://github.com/projectatomic/rpm-ostree/issues/212
+            vmreboot
+            vmssh rpm-ostree deploy "$revision" || rc=$?
+
+            if [ $rc == 77 ]; then
+                skip_reboot=1
+            elif [ $rc != 0 ]; then
+                update_github error "Failed to upgrade or deploy."
+                touch $state/exit # signal testrunner to exit nicely
+                exit 0
+            fi
+        fi
+    fi
+
+    if [ $skip_reboot != 1 ]; then
+        vmreboot
+    fi
+}
+
+update_github() {
+    local context=$(cat $state/parsed/context)
+    common_update_github "$context" "$@"
+}
+
+vmssh() {
+    ssh -q -n -i $state/node_key \
+        -o StrictHostKeyChecking=no \
+        -o PasswordAuthentication=no \
+        -o UserKnownHostsFile=/dev/null \
+        root@$(cat $outdir/node_addr) "$@"
+}
+
+vmreboot() {
+    vmssh systemctl reboot || :
+    sleep 3 # give time for port to go down
+    ssh_wait $(cat $outdir/node_addr) $state/node_key
+}
+
+on_atomic_host() {
+    vmssh test -f /run/ostree-booted
+}
+
+main "$@"
diff --git a/spawner.py b/spawner.py
index 6782776..6e713fe 100755
--- a/spawner.py
+++ b/spawner.py
@@ -100,6 +100,7 @@ def read_pipe(idx, fd):
     while s != b'':
         if not s.endswith(b'\n'):
             s += b'\n'
+        # pylint: disable=no-member
         sys.stdout.buffer.write((b'[%d] ' % idx) + s)
         s = fd.readline()
diff --git a/testrunner b/testrunner
index cdcb443..0a511b3 100755
--- a/testrunner
+++ b/testrunner
@@ -10,19 +10,10 @@ source $THIS_DIR/utils/common.sh
 
 main() {
 
-    # NB: bash trickery: don't use any of the function calls
-    # in if-statements, it will completely disable set -e
-    # inside the function... Yet another reason to port this
-    # to Python.
-
-    # NB2: if you need to change directory, do it in a
-    # subshell.
-
-    # NB3: the use of eval is strictly forbidden. Never
-    # directly run a user-provided variable.
+    # NB: see the various NBs in the main() of main.
 
     # We take a single argument; the state dir index to use.
-    # But we still expect the global state dir to be the
+    # But we still expect the global state dir to be in the
     # $PWD.
 
     state_idx=$1; shift
@@ -52,8 +43,14 @@ provision_env() {
         ensure_teardown_container
         provision_container
     else
-        ensure_teardown_node
-        provision_node
+        ssh_setup_key
+        if clustered; then
+            ensure_teardown_cluster
+            provision_cluster
+        else
+            ensure_teardown_node
+            provision_node
+        fi
     fi
 }
@@ -74,38 +71,49 @@ provision_container() {
 
 provision_node() {
 
-    # the allowed fields for "distro" are the same as the image name in glance
-    local image=$(cat $state/parsed/distro)
-
-    update_github pending "Provisioning test node..."
+    update_github pending "Provisioning host..."
 
# substitute node to use in ":" format if [ -n "${RHCI_DEBUG_USE_NODE:-}" ]; then - echo "${RHCI_DEBUG_USE_NODE%:*}" > $state/node_name - echo "${RHCI_DEBUG_USE_NODE#*:}" > $state/node_addr + echo "${RHCI_DEBUG_USE_NODE%:*}" > $state/host/node_name + echo "${RHCI_DEBUG_USE_NODE#*:}" > $state/host/node_addr else - # XXX: We hardcode m1.small for now, but these really - # should be specified indirectly from the .redhat-ci - # YAML file through e.g. min-* vars. - env \ - os_image="$image" \ - os_flavor=m1.small \ - os_name_prefix=github-ci-testnode \ - os_user_data="$THIS_DIR/utils/user-data" \ - "$THIS_DIR/utils/os_provision.py" $state + $THIS_DIR/provisioner $state $state/parsed/host $state/host + if [ -f $state/exit ]; then + # the provisioner encountered a user error and already updated GH + exit 0 + fi fi +} - ssh_setup_key +provision_cluster() { + local nhosts=$(cat $state/parsed/nhosts) - ssh_wait + update_github pending "Provisioning cluster..." - if [ -f $state/parsed/ostree_revision ]; then - if ! on_atomic_host; then - update_github error "Cannot specify 'ostree' on non-AH." - exit 0 - fi - deploy_ostree + seq 0 $((nhosts - 1)) | xargs -P 0 -n 1 -I {} \ + $THIS_DIR/provisioner $state $state/parsed/host-{} $state/host-{} + + if [ -f $state/exit ]; then + # a provisioner encountered a user error and already updated GH + exit 0 fi + + if container_controlled; then + provision_container + else + # make the first host the controller + ln -s host-0 $state/host + fi + + local i=0 + while [ $i -lt $nhosts ]; do + local name=$(cat $state/parsed/host-$i/name) + local addr=$(cat $state/host-$i/node_addr) + name=$(sed 's/[.-]/_/g' <<< "$name") + echo "export RHCI_${name}_IP=${addr}" >> $state/parsed/envs + i=$((i + 1)) + done } prepare_env() { @@ -127,6 +135,10 @@ prepare_env() { fi fi + if clustered; then + ssh_setup_cluster + fi + envcmd mkdir /var/tmp/checkout envcp checkouts/$github_repo/. /var/tmp/checkout } @@ -138,97 +150,77 @@ ssh_setup_key() { set -x } -ssh_wait() { - local node_addr=$(cat $state/node_addr) - - timeout 120s "$THIS_DIR/utils/sshwait" $node_addr - - # We have to be extra cautious here -- OpenStack - # networking takes some time to settle, so we wait until - # we can contact the node for 5 continuous seconds. - - local max_sleep=30 - local failed=1 - - sustain_true() { - local sustain=5 - while [ $sustain -gt 0 ]; do - if ! vmssh true; then - return 1 - fi - sustain=$((sustain - 1)) - max_sleep=$((max_sleep - 1)) - sleep 1 - done - failed=0 - } - - while ! sustain_true && [ $max_sleep -gt 0 ]; do - max_sleep=$((max_sleep - 1)) - sleep 1 +ssh_setup_cluster() { + local nhosts=$(cat $state/parsed/nhosts) + + # since the common case is to interact with the various + # nodes by ssh, let's make sure it's all set up nicely + # ahead of time + + # let's go through the hosts once to collect keys + local i=0 + while [ $i -lt $nhosts ]; do + local name=$(cat $state/parsed/host-$i/name) + local addr=$(cat $state/host-$i/node_addr) + ssh-keyscan $addr 2>/dev/null | \ + sed "s/^/$name,/" >> $state/known_hosts + echo $addr $name >> $state/hosts + i=$((i + 1)) done - unset -f sustain_true - - if [ $failed == 1 ]; then - echo "ERROR: Timed out while waiting for SSH." - return 1 + # We use a different key than the one used to provision + # the nodes here, since we don't want to expose the + # private key of the OpenStack keypair used. NB: not in + # a state dir; we don't want to regen a key on every + # run. + if [ ! 
-f cluster_keypair/id_rsa ]; then + # let's just stick with RSA since it's supported on + # all platforms + mkdir -p cluster_keypair + ssh-keygen -t rsa -b 4096 -N "" -f cluster_keypair/id_rsa fi -} - -deploy_ostree() { - local remote=$(cat $state/parsed/ostree_remote) - local branch=$(cat $state/parsed/ostree_branch) - local revision=$(cat $state/parsed/ostree_revision) - - skip_reboot=0 - if [ -z "$remote" ] && [ -z "$branch" ]; then - - rc=0 - if [ -z "$revision" ]; then - vmssh rpm-ostree upgrade --upgrade-unchanged-exit-77 || rc=$? - else - vmssh rpm-ostree deploy "$revision" || rc=$? - fi - - if [ $rc == 77 ]; then - skip_reboot=1 - elif [ $rc != 0 ]; then - update_github error "Failed to upgrade or deploy." - exit 0 - fi - else - local refspec - if [ -n "$remote" ]; then - vmssh ostree remote add --no-gpg-verify rhci "$remote" - refspec=rhci: + if container_controlled; then + # most base images don't have ssh + if ! envcmd [ -x /bin/ssh ]; then + env_make_cache + envcmd yum install -y openssh-clients fi + envcmd mkdir -m 0600 /root/.ssh + envcp cluster_keypair/id_rsa /root/.ssh + envcp $state/known_hosts /root/.ssh + envcp $state/hosts /etc/hosts.append + envcmd sh -c "cat /etc/hosts.append >> /etc/hosts" + fi - if [ -n "$branch" ]; then - refspec="${refspec}$branch" - fi - - vmssh rpm-ostree rebase "$refspec" - - if [ -n "$revision" ]; then - # we should really be able to do this in a single step - # https://github.com/projectatomic/rpm-ostree/issues/212 - vmreboot - vmssh rpm-ostree deploy "$revision" || rc=$? + vmipssh() { + ip=$1; shift + ssh -q -i $state/node_key \ + -o StrictHostKeyChecking=no \ + -o PasswordAuthentication=no \ + -o UserKnownHostsFile=/dev/null \ + root@$ip "$@" + } - if [ $rc == 77 ]; then - skip_reboot=1 - elif [ $rc != 0 ]; then - update_github error "Failed to upgrade or deploy." - exit 0 - fi - fi - fi + i=0 + while [ $i -lt $nhosts ]; do + local name=$(cat $state/parsed/host-$i/name) + local addr=$(cat $state/host-$i/node_addr) + + # some of these could be redone more cleanly through + # cloud-init, though the dynamic aspect would + # probably end up making it look similar + vmipssh $addr hostnamectl set-hostname $name + vmipssh $addr "cat >> /etc/hosts" < $state/hosts + vmipssh $addr "cat >> /root/.ssh/known_hosts" < $state/known_hosts + vmipssh $addr "cat > /root/.ssh/id_rsa" < cluster_keypair/id_rsa + vmipssh $addr chmod 0400 /root/.ssh/id_rsa + vmipssh $addr "cat >> /root/.ssh/authorized_keys" \ + < cluster_keypair/id_rsa.pub + i=$((i + 1)) + done - if [ $skip_reboot != 1 ]; then - vmreboot - fi + unset -f vmipssh } overlay_packages() { @@ -255,16 +247,7 @@ install_packages() { mgr=dnf fi - # This is hacky and sad. Preemptively try to refresh - # yum/dnf cache to work around flaky distro infras. - - local retries=5 - while [ $retries -gt 0 ]; do - if envcmd $mgr makecache; then - break - fi - retries=$((retries - 1)) - done + env_make_cache local rc=0 logged_envcmd $upload_dir/setup.log / - - \ @@ -277,6 +260,20 @@ install_packages() { fi } +env_make_cache() { + + # This is hacky and sad. Preemptively try to refresh + # yum/dnf cache to work around flaky distro infras. + + local retries=5 + while [ $retries -gt 0 ]; do + if envcmd yum makecache; then + break + fi + retries=$((retries - 1)) + done +} + run_loop() { local timeout=$1; shift local logfile=$1; shift @@ -370,8 +367,8 @@ fetch_artifacts() { mkdir $upload_dir/artifacts local fetched_at_least_one=0 - if ! 
containerized; then - local node_addr=$(cat $state/node_addr) + if host_controlled; then + local node_addr=$(cat $state/host/node_addr) while IFS='' read -r artifact || [[ -n $artifact ]]; do path="/var/tmp/checkout/$artifact" @@ -542,18 +539,18 @@ timed_envcmd() { # effectively do a 'sh -c' on the passed command, which # means that quoting might be an issue. - if containerized; then + if container_controlled; then local cid=$(cat $state/cid) sudo timeout --signal=KILL $timeout \ docker exec $cid "$@" else - local node_addr=$(cat $state/node_addr) + local node_addr=$(cat $state/host/node_addr) timeout --signal=KILL $timeout \ ssh -q -n -i $state/node_key \ -o StrictHostKeyChecking=no \ -o PasswordAuthentication=no \ -o UserKnownHostsFile=/dev/null \ - root@$(cat $state/node_addr) "$@" + root@$node_addr "$@" fi } @@ -572,11 +569,11 @@ envcp() { # Also, rsync creates nonexistent dirs, whereas docker # does not, so explicitly mkdir beforehand. - if containerized; then + if container_controlled; then local cid=$(cat $state/cid) sudo docker cp $target $cid:$remote else - local node_addr=$(cat $state/node_addr) + local node_addr=$(cat $state/host/node_addr) rsync --quiet -az --no-owner --no-group \ -e "ssh -q -i $state/node_key \ -o StrictHostKeyChecking=no \ @@ -593,7 +590,7 @@ vmssh() { -o StrictHostKeyChecking=no \ -o PasswordAuthentication=no \ -o UserKnownHostsFile=/dev/null \ - root@$(cat $state/node_addr) "$@" + root@$(cat $state/host/node_addr) "$@" } vmscp() { @@ -606,7 +603,7 @@ vmscp() { vmreboot() { vmssh systemctl reboot || : sleep 3 # give time for port to go down - ssh_wait + ssh_wait $(cat $state/host/node_addr) $state/node_key } update_github() { @@ -619,20 +616,26 @@ ensure_err_github_update() { } teardown_node() { + teardown_node_impl \ + $(cat $state/host/node_name) \ + $(cat $state/host/node_addr) +} - if [ -f $state/node_name ]; then - - local node_name=$(cat $state/node_name) - local node_addr=$(cat $state/node_addr) +teardown_node_impl() { + local node_name=$1; shift + local node_addr=$1; shift - if [ -f $state/node_addr ] && \ - [ -n "${os_floating_ip_pool:-}" ]; then - nova floating-ip-disassociate $node_name $node_addr - nova floating-ip-delete $node_addr - fi + if [ -z "$node_name" ]; then + return + fi - nova delete $(cat $state/node_name) + if [ -n "$node_addr" ] && \ + [ -n "${os_floating_ip_pool:-}" ]; then + nova floating-ip-disassociate $node_name $node_addr + nova floating-ip-delete $node_addr fi + + nova delete $node_name } ensure_teardown_node() { @@ -653,8 +656,46 @@ ensure_teardown_container() { fi } +teardown_cluster() { + local nhosts=$(cat $state/parsed/nhosts) + + local i=0 + while [ $i -lt $nhosts ]; do + teardown_node_impl \ + $(cat $state/host-$i/node_name) \ + $(cat $state/host-$i/node_addr) + i=$((i + 1)) + done + + if container_controlled; then + teardown_container + fi +} + +ensure_teardown_cluster() { + if [ -z "${RHCI_DEBUG_NO_TEARDOWN:-}" ]; then + trap teardown_cluster EXIT + fi +} + containerized() { - [ -f $state/parsed/image ] + [ "$(cat $state/parsed/envtype)" = container ] +} + +virtualized() { + [ "$(cat $state/parsed/envtype)" = host ] +} + +clustered() { + [ "$(cat $state/parsed/envtype)" = cluster ] +} + +container_controlled() { + [ "$(cat $state/parsed/controller)" = container ] +} + +host_controlled() { + [ "$(cat $state/parsed/controller)" = host ] } on_atomic_host() { diff --git a/utils/common.py b/utils/common.py index d3f81e0..4f25300 100644 --- a/utils/common.py +++ b/utils/common.py @@ -1,5 +1,6 @@ import re + # 
http://stackoverflow.com/a/39596504/308136
 def ordinal(n):
     suffix = ['th', 'st', 'nd', 'rd', 'th', 'th', 'th', 'th', 'th', 'th']
@@ -14,6 +15,7 @@ def ordinal(n):
     return str(n) + s
 
+
 # normalize timeout str to seconds
 def str_to_timeout(s):
     assert re.match('^[0-9]+[smh]$', s)
diff --git a/utils/common.sh b/utils/common.sh
index 820f467..9526419 100644
--- a/utils/common.sh
+++ b/utils/common.sh
@@ -46,3 +46,49 @@ common_update_github() {
             --url "$url"
     fi
 }
+
+# Block until a node is available through SSH
+# $1    node IP address
+# $2    private key
+ssh_wait() {
+    local node_addr=$1; shift
+    local node_key=$1; shift
+
+    timeout 120s "$THIS_DIR/utils/sshwait" $node_addr
+
+    # We have to be extra cautious here -- OpenStack
+    # networking takes some time to settle, so we wait until
+    # we can contact the node for 5 continuous seconds.
+
+    local max_sleep=30
+    local failed=1
+
+    sustain_true() {
+        local sustain=5
+        while [ $sustain -gt 0 ]; do
+            if ! ssh -q -n -i $node_key \
+                     -o StrictHostKeyChecking=no \
+                     -o PasswordAuthentication=no \
+                     -o UserKnownHostsFile=/dev/null \
+                     root@$node_addr true; then
+                return 1
+            fi
+            sustain=$((sustain - 1))
+            max_sleep=$((max_sleep - 1))
+            sleep 1
+        done
+        failed=0
+    }
+
+    while ! sustain_true && [ $max_sleep -gt 0 ]; do
+        max_sleep=$((max_sleep - 1))
+        sleep 1
+    done
+
+    unset -f sustain_true
+
+    if [ $failed == 1 ]; then
+        echo "ERROR: Timed out while waiting for SSH."
+        return 1
+    fi
+}
diff --git a/utils/os_provision.py b/utils/os_provision.py
index 6f0d2dc..bd6e056 100755
--- a/utils/os_provision.py
+++ b/utils/os_provision.py
@@ -1,21 +1,20 @@
 #!/usr/bin/env python3
 
-# This script is not meant to be run manually. It is called
-# from the main script.
-
-# Assumes the usual OpenStack authentication env vars are
-# defined.
-
-# Expects the following env vars:
-# os_image
-# os_flavor
-# os_keyname
-# os_network
-# os_user_data
-# os_name_prefix
-# os_floating_ip_pool (optional)
-
-# See the README for details.
+'''
+    This script is not meant to be run manually. It is
+    called from the main script. See the README for details.
+
+    We assume that the usual OpenStack authentication env
+    vars are defined. Additionally, the following env vars
+    are expected:
+        - os_image
+        - os_flavor
+        - os_keyname
+        - os_network
+        - os_user_data
+        - os_name_prefix
+        - os_floating_ip_pool (optional)
+'''
 
 import os
 import sys
diff --git a/utils/user-data b/utils/user-data
index 490e363..4b336e4 100644
--- a/utils/user-data
+++ b/utils/user-data
@@ -1,6 +1,5 @@
 #cloud-config
 disable_root: 0
-ssh_pwauth: 1
 
 users:
   - name: root
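
For reviewers, a rough sketch of the parsed state layout that flush_suite()/flush_host()
(PATCH 2/3) write out for the sample 'cluster' suite above, and that the testrunner and
provisioner (PATCH 3/3) then consume. The file names come from the parser code; the
contents shown are illustrative:

    parsed/
        envtype            # "cluster"
        controller         # "container" (the sample cluster defines one)
        nhosts             # "2"
        image              # "fedora:24"
        tests, branches, timeout, context, ...
        host-0/
            name               # "host1"
            distro             # "centos/7/atomic"
            ostree_revision    # "" (i.e. 'ostree: latest')
        host-1/
            name               # "host2"
            distro             # "fedora/24/cloud"

The testrunner then runs the provisioner once per parsed/host-<i> directory in parallel
(xargs -P 0), each invocation writing its node_name/node_addr into a sibling host-<i>
output directory under $state, and afterwards either provisions the controller container
or symlinks $state/host to host-0.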