This repository has been archived by the owner on Mar 3, 2023. It is now read-only.

WIP: Add k8s integration tests to CI #3561

Draft · wants to merge 2 commits into base: master

Changes from 1 commit
2 changes: 1 addition & 1 deletion heron/config/src/yaml/conf/kubernetes/statemgr.yaml
@@ -19,7 +19,7 @@
heron.class.state.manager: org.apache.heron.statemgr.zookeeper.curator.CuratorStateManager

# local state manager connection string
heron.statemgr.connection.string: <zookeeper_host:zookeeper_port>
heron.statemgr.connection.string: 127.0.0.1:2181
Contributor:
Why is this set to 127.0.0.1 instead of a Kubernetes service name like zookeeper:2181?

If it's set like this only for testing, would it be better to keep the placeholder here and substitute 127.0.0.1:2181 in the Kubernetes portion of the test script?

Contributor Author:

On the host machine ZooKeeper is available on 127.0.0.1:2181 due to a kubectl port-forward. That helped get the integration test runner further, but I take it this same config is also consumed within the cluster, where it should point to zookeeper:2181?
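
(For reference, the port-forward in question is set up later in this diff, in scripts/travis/k8s.sh:)

kubectl port-forward service/zookeeper 2181:2181 &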

Contributor:

Yeah, I think it was a placeholder for someone configuring Heron to run in Kubernetes. But each of our K8s yamls actually overrides the setting as a -D parameter passed in to the heron-apiserver command. So I don't think your code change would impact anything beyond it not being immediately apparent that this is no longer a placeholder. I wonder what part of the test scripts needs direct ZooKeeper access.


# path of the root address to store the state in a local file system
heron.statemgr.root.path: "/heron"
10 changes: 7 additions & 3 deletions integration_test/src/python/test_runner/main.py
@@ -20,10 +20,10 @@
import logging
import os
import pkgutil
import random
import re
import sys
import time
import uuid
from http.client import HTTPConnection
from threading import Lock, Thread

@@ -313,6 +313,7 @@ def run_tests(conf, test_args):
''' Run the test for each topology specified in the conf file '''
lock = Lock()
timestamp = time.strftime('%Y%m%d%H%M%S')
run_fingerprint = f"{timestamp}-{random.randint(0, 2**16):04x}"

http_server_host_port = "%s:%d" % (test_args.http_server_hostname, test_args.http_server_port)

@@ -361,8 +362,11 @@ def _run_single_test(topology_name, topology_conf, test_args, http_server_host_p
lock.release()

test_threads = []
for topology_conf in test_topologies:
topology_name = ("%s_%s_%s") % (timestamp, topology_conf["topologyName"], str(uuid.uuid4()))
for i, topology_conf in enumerate(test_topologies, 1):
# this name has to be valid for all tested schedulers, state managers, etc.
topology_name = f"run-{run_fingerprint}-test-{i:03}"
# TODO: make sure logs describe the test/topology that fails, as topology_name is now opaque
# topology_conf["topologyName"]
classpath = topology_classpath_prefix + topology_conf["classPath"]

# if the test includes an update we need to pass that info to the topology so it can send
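
A sketch of the resulting names (illustrative values only): the run-<timestamp>-<hex>-test-<n> scheme uses only lowercase alphanumerics and hyphens, presumably to satisfy DNS-1123 style name rules in schedulers like Kubernetes, which the old timestamp_topologyName_uuid underscore names violated.

import random
import time

timestamp = time.strftime('%Y%m%d%H%M%S')  # e.g. "20230301120000"
# note: randint's upper bound is inclusive, so 2**16 (0x10000) can
# occasionally produce a fifth hex digit despite the :04x width hint
run_fingerprint = f"{timestamp}-{random.randint(0, 2**16):04x}"
topology_name = f"run-{run_fingerprint}-test-{1:03}"
print(topology_name)  # e.g. run-20230301120000-1a2b-test-001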
12 changes: 9 additions & 3 deletions integration_test/src/python/topology_test_runner/main.py
@@ -20,10 +20,10 @@
import logging
import os
import pkgutil
import random
import re
import sys
import time
import uuid
from http.client import HTTPConnection
from threading import Lock, Thread

@@ -255,6 +255,8 @@ def __init__(self, topology_name, cluster):
self.state_mgr.start()

def _load_state_mgr(self, cluster):
# this should use cli_config_path (after expanding $HOME, or changing the default to use ~ instead of $HOME)
# so that tests can run against their own copy of the config
state_mgr_config = configloader.load_state_manager_locations(cluster, os.getenv("HOME")
+'/.heron/conf/'+cluster
+ '/statemgr.yaml')
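
A hedged sketch of what that comment proposes, assuming a cli_config_path option on test_args (hypothetical name, taken from the comment) whose value may contain ~ or $HOME:

import os

# hypothetical: cli_config_path may be "~/.heron/conf" or "$HOME/.heron/conf"
cli_config_path = os.path.expanduser(os.path.expandvars(test_args.cli_config_path))
state_mgr_config = configloader.load_state_manager_locations(
    cluster, os.path.join(cli_config_path, cluster, 'statemgr.yaml'))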
@@ -496,6 +498,7 @@ def run_topology_tests(conf, test_args):
"""
lock = Lock()
timestamp = time.strftime('%Y%m%d%H%M%S')
run_fingerprint = f"{timestamp}-{random.randint(0, 2**16):04x}"

http_server_host_port = "%s:%d" % (test_args.http_hostname, test_args.http_port)

@@ -562,8 +565,11 @@ def _run_single_test(topology_name, topology_conf, test_args, http_server_host_p
lock.release()

test_threads = []
for topology_conf in test_topologies:
topology_name = ("%s_%s_%s") % (timestamp, topology_conf["topologyName"], str(uuid.uuid4()))
for i, topology_conf in enumerate(test_topologies, 1):
# this name has to be valid for all tested schedulers, state managers, etc.
topology_name = f"run-{run_fingerprint}-test-{i:03}"
# TODO: make sure logs describe the test/topology that fails, as topology_name is now opaque
# topology_conf["topologyName"]
classpath = topology_classpath_prefix + topology_conf["classPath"]

update_args = ""
4 changes: 2 additions & 2 deletions scripts/release/docker-images
@@ -66,7 +66,8 @@ def build_dockerfile(
log_run([str(BUILD_IMAGE), dist, tag, scratch], log)
tar = Path(scratch) / f"heron-docker-{tag}-{dist}.tar.gz"
tar_out = out_dir / tar.name
tar.replace(tar_out)
# using shutil.move rather than Path.replace as it also works when moving between devices (filesystems)
shutil.move(tar, tar_out)
logging.info("docker image complete: %s", tar_out)
return tar_out
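
Why the switch matters, as a runnable sketch (paths are illustrative): Path.replace maps to rename(2), which fails with EXDEV when source and destination sit on different filesystems (e.g. a tmpfs scratch dir and the output volume), while shutil.move detects that case and falls back to copy-and-delete.

import errno
import shutil
import tempfile
from pathlib import Path

src = Path(tempfile.mkdtemp()) / "heron-docker-test-debian10.tar.gz"
src.write_bytes(b"fake archive")
dst = Path(tempfile.mkdtemp()) / src.name  # may live on another filesystem

try:
    src.replace(dst)  # rename(2); raises OSError(EXDEV) across devices
except OSError as e:
    if e.errno != errno.EXDEV:
        raise
    shutil.move(str(src), str(dst))  # copies then deletes when rename can't work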

@@ -107,7 +108,6 @@ def build_target(tag: str, target: str) -> typing.List[Path]:
scratch = Path(tempfile.mkdtemp(prefix=f"build-{target}-"))
log_path = scratch / "log.txt"
log = log_path.open("w")
logging.debug("building %s", target)

try:
tar = build_dockerfile(scratch, target, tag, out_dir, log)
13 changes: 13 additions & 0 deletions scripts/travis/install.sh
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

DIR="$(dirname "$0")"
UTILS="${DIR}/../shutils"
source "${UTILS}/common.sh"

# Autodiscover the platform
PLATFORM=$(discover_platform)
echo "Using $PLATFORM platform"

# install heron locally
bazel --bazelrc=tools/travis/bazel.rc run --config="$PLATFORM" -- scripts/packages:heron-install.sh --user
99 changes: 78 additions & 21 deletions scripts/travis/k8s.sh
@@ -1,12 +1,21 @@
#!/usr/bin/env bash
:<<'DOC'
Set NO_CACHE=1 to always rebuild images.
Set DEBUG=1 to skip cleanup.

This requires kubectl and kind.

DOC
set -o errexit -o nounset -o pipefail
TAG=test
HERE="$(cd "$(dirname "$0")"; pwd -P)"
ROOT="$(cd "$HERE/../.."; pwd -P)"
ROOT="$(git rev-parse --show-toplevel)"

CLUSTER_NAME="kubernetes"

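# service endpoints, reachable through the kubectl proxy started below on port 8001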
API_URL="http://localhost:8001/api/v1/namespaces/default/services/heron-apiserver:9000/proxy"
TRACKER_URL="http://localhost:8001/api/v1/namespaces/default/services/heron-tracker:8888/proxy"
UI_URL="http://localhost:8001/api/v1/namespaces/default/services/heron-ui:8889/proxy"

function bazel_file {
# bazel_file VAR_NAME //some/build:target
@@ -23,14 +32,6 @@ function kind_images {
docker exec -it kind-control-plane crictl images
}

function install_helm3 {
pushd /tmp
curl --location https://get.helm.sh/helm-v3.2.1-linux-amd64.tar.gz --output helm.tar.gz
tar --extract --file=helm.tar.gz --strip-components=1 linux-amd64/helm
mv helm ~/.local/bin/
popd
}

function action {
(
tput setaf 4;
@@ -39,12 +40,23 @@ function action {
) > /dev/stderr
}

function clean {
if [ -z "${DEBUG-}" ]; then
action "Cleaning up"
kind delete cluster
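# kill the whole process group (-$$) so backgrounded kubectl proxy / port-forward processes exit too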
kill -TERM -$$
else
action "Not cleaning up due to DEBUG flag being set"
fi
}

function create_cluster {
# trap "kind delete cluster" EXIT
if [ -z "$(kind get clusters)" ]; then
action "Creating kind cluster"
kind create cluster --config="$0.kind.yaml"
else
action "Using existing kind cluster"
fi
}

@@ -59,34 +71,79 @@ function get_image {
out="$expected"
else
action "Creating heron image"
local gz="$(scripts/release/docker-images build test debian10)"
local gz="$(scripts/release/docker-images build test "$distro")"
# XXX: must un .gz https://github.com/kubernetes-sigs/kind/issues/1636
gzip --decompress "$gz"
out="${gz%%.gz}"
fi
archive="$out"
}

function url_wait {
local seconds="$1"
local url="$2"
local retries=0
tput sc # save the cursor position so the status line can be redrawn in place
while ! curl "$url" --location --fail --silent --output /dev/null --write-out "attempt $retries: %{http_code}"
do
retries=$((retries + 1))
# with one-second sleeps, the retry cap approximates a timeout in seconds
if [ "$retries" -eq "$seconds" ]; then
echo
return 1
fi
sleep 1
tput el1 # clear to the start of the line, then restore and re-save the cursor
tput rc
tput sc
done
echo
}

trap clean EXIT
create_cluster

get_image debian10
#get_image debian10
get_image ubuntu20.04
heron_archive="$archive"
action "Loading heron docker image"
kind load image-archive "$heron_archive"
#image_heron="docker.io/bazel/scripts/images:heron"
#image_heron="$heron_image"
image_heron="heron/heron:$TAG"

action "Loading bookkeeper image"
image_bookkeeper="docker.io/apache/bookkeeper:4.7.3"
docker pull "$image_bookkeeper"
kind load docker-image "$image_bookkeeper"

action "Deploying heron with helm"
# install heron in kind using helm
bazel_file helm_yaml //scripts/packages:index.yaml
helm install heron "$(dirname "$helm_yaml")/heron-0.0.0.tgz" \
--set image="$image_heron" \
--set imagePullPolicy=IfNotPresent \
--set bookieReplicas=1 \
--set zkReplicas=1
action "Deploying to kubernetes"
#kubectl create namespace default
kubectl config set-context kind-kind --namespace=default

# deploy
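# point the stock minikube manifests at the locally built image tag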
DIR=./deploy/kubernetes/minikube
sed "s#heron/heron:latest#$image_heron#g" ${DIR}/zookeeper.yaml > /tmp/zookeeper.yaml
sed "s#heron/heron:latest#$image_heron#g" ${DIR}/tools.yaml > /tmp/tools.yaml
sed "s#heron/heron:latest#$image_heron#g" ${DIR}/apiserver.yaml > /tmp/apiserver.yaml

kubectl proxy -p 8001 &

kubectl create -f /tmp/zookeeper.yaml
kubectl get pods
kubectl create -f ${DIR}/bookkeeper.yaml
kubectl create -f /tmp/tools.yaml
kubectl create -f /tmp/apiserver.yaml

action "waiting for API server"
url_wait 120 "$API_URL/api/v1/version"
action "waiting for UI"
url_wait 120 "$UI_URL/proxy"

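# expose zookeeper on the host at 127.0.0.1:2181 for the test runner (see the statemgr.yaml change above)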
kubectl port-forward service/zookeeper 2181:2181 &

action "API: $API_URL"
action "Tracker: $TRACKER_URL"
action "UI: $UI_URL"
action "Zookeeper: 127.0.0.1:2181"

heron config "$CLUSTER_NAME" set service_url "$API_URL"

"$HERE/test.sh"
2 changes: 2 additions & 0 deletions scripts/travis/k8s.sh.kind.yaml
@@ -1,4 +1,6 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
# to change the Kubernetes version, see the tags and digests at https://hub.docker.com/r/kindest/node/tags
nodes:
- role: control-plane
# image: kindest/node:v1.16.9@sha256:7175872357bc85847ec4b1aba46ed1d12fa054c83ac7a8a11f5c268957fd5765