Implement an integration test for recovery after a network cut (#116)
* Refactor scale down logic to not force remove unreachable instances

* Add unit tests

* Address lint warnings

* Add sleep and idle timeout in test code in an attempt to stabilize tests

* Address lint warnings

* Add sleep after killing pod to give test environment time to stabilize

* Wait for unit to be removed from cluster after force removing it

* Wait for forcefully removed unit to actually be removed from the cluster

* Add integration test for recovery after a network cut

* Remove duplicate code introduced after merging main

* Install helm3 in CI runner for self healing integration tests

* Fix failing unit test

* Pin flake8 to v5.0.4 due to incompatibilities with v6

* Install helm on CI runners using sudo

* Address PR feedback; move chaos_mesh installation to fixture
shayancanonical authored Dec 4, 2022
1 parent 0d8ea2d commit 592a333
Showing 10 changed files with 344 additions and 74 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yaml
@@ -119,6 +119,8 @@ jobs:
# This is needed until https://bugs.launchpad.net/juju/+bug/1977582 is fixed
channel: 1.23/stable
bootstrap-options: "--agent-version 2.9.29"
- name: Install Helm
run: /usr/bin/sudo snap install helm3
- name: Run self healing integration tests
run: tox -e integration-self-healing

135 changes: 73 additions & 62 deletions src/charm.py
@@ -32,8 +32,8 @@
from constants import (
CLUSTER_ADMIN_PASSWORD_KEY,
CLUSTER_ADMIN_USERNAME,
CONFIGURED_FILE,
CONTAINER_NAME,
MYSQLD_CONFIG_FILE,
MYSQLD_SERVICE,
PASSWORD_LENGTH,
PEER,
@@ -251,7 +251,49 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
"app", required_password, generate_random_password(PASSWORD_LENGTH)
)

def _on_mysql_pebble_ready(self, event):
def _configure_instance(self, container) -> bool:
"""Configure the instance for use in Group Replication."""
try:
# Run mysqld for the first time to
# bootstrap the data directory and users
logger.debug("Initializing instance")
self._mysql.initialise_mysqld()

# Add the pebble layer
logger.debug("Adding pebble layer")
container.add_layer(MYSQLD_SERVICE, self._pebble_layer, combine=False)
container.restart(MYSQLD_SERVICE)

logger.debug("Waiting for instance to be ready")
self._mysql.wait_until_mysql_connection()

logger.info("Configuring instance")
# Configure all base users and revoke privileges from the root users
self._mysql.configure_mysql_users()
# Configure instance as a cluster node
self._mysql.configure_instance()

self.unit_peer_data["unit-configured"] = "True"
except (
MySQLConfigureInstanceError,
MySQLConfigureMySQLUsersError,
MySQLInitialiseMySQLDError,
MySQLCreateCustomConfigFileError,
) as e:
logger.debug("Unable to configure instance: {}".format(e))
return False

try:
# Set workload version
workload_version = self._mysql.get_mysql_version()
self.unit.set_workload_version(workload_version)
except MySQLGetMySQLVersionError:
# Do not block the charm if the version cannot be retrieved
pass

return True

def _on_mysql_pebble_ready(self, event) -> None:
"""Pebble ready handler.
Define and start a pebble service and bootstrap instance.
@@ -264,83 +306,52 @@ def _on_mysql_pebble_ready(self, event):

container = event.workload

if container.exists(CONFIGURED_FILE):
# When reusing a volume
# Configure the layer when changed
if not container.exists(MYSQLD_CONFIG_FILE):
self._mysql.create_custom_config_file(
report_host=self.get_unit_hostname(self.unit.name)
)

if self.unit_peer_data.get("unit-configured"):
# Only update pebble layer if unit is already configured for GR
current_layer = container.get_plan()
new_layer = self._pebble_layer

if new_layer.services != current_layer.services:
logger.info("Add pebble layer")
logger.info("Adding pebble layer")

container.add_layer(MYSQLD_SERVICE, new_layer, combine=True)
container.restart(MYSQLD_SERVICE)
self._mysql.wait_until_mysql_connection()

self.unit.status = ActiveStatus()
return

# First run setup
self.unit.status = MaintenanceStatus("Initialising mysqld")
try:

# Run mysqld for the first time to
# bootstrap the data directory and users
logger.debug("Initialising instance")
self._mysql.initialise_mysqld()

# Create custom server config file
logger.debug("Create custom config")
self._mysql.create_custom_config_file(
report_host=self.get_unit_hostname(self.unit.name)
)

# Add the pebble layer
container.add_layer(MYSQLD_SERVICE, self._pebble_layer, combine=False)
container.restart(MYSQLD_SERVICE)
logger.debug("Waiting for instance to be ready")
self._mysql.wait_until_mysql_connection()
logger.info("Configuring instance")
# Configure all base users and revoke
# privileges from the root users
self._mysql.configure_mysql_users()
# Configure instance as a cluster node
self._mysql.configure_instance()
# set workload version
workload_version = self._mysql.get_mysql_version()
self.unit.set_workload_version(workload_version)

except (
MySQLConfigureInstanceError,
MySQLConfigureMySQLUsersError,
MySQLInitialiseMySQLDError,
MySQLCreateCustomConfigFileError,
) as e:
# First run setup
if not self._configure_instance(container):
self.unit.status = BlockedStatus("Unable to configure instance")
logger.debug("Unable to configure instance: {}".format(e))
return
except MySQLGetMySQLVersionError:
# Do not block the charm if the version cannot be retrieved
pass

if self.unit.is_leader():
try:
# Create the cluster when is the leader unit
unit_label = self.unit.name.replace("/", "-")
self._mysql.create_cluster(unit_label)
logger.debug("Cluster configured on unit")
# Create control file in data directory
container.push(CONFIGURED_FILE, make_dirs=True, source="configured")
self.app_peer_data["units-added-to-cluster"] = "1"
self.unit_peer_data["unit-initialized"] = "True"
self.unit.status = ActiveStatus()
except MySQLCreateClusterError as e:
self.unit.status = BlockedStatus("Unable to create cluster")
logger.debug("Unable to create cluster: {}".format(e))
else:
# When unit is not the leader, it should wait
# for the leader to configure it a cluster node
if not self.unit.is_leader():
# Non-leader units should wait for leader to add them to the cluster
self.unit.status = WaitingStatus("Waiting for instance to join the cluster")
return

try:
# Create the cluster on the leader unit
logger.info("Creating cluster on the leader unit")
unit_label = self.unit.name.replace("/", "-")
self._mysql.create_cluster(unit_label)

# Create control file in data directory
container.push(CONFIGURED_FILE, make_dirs=True, source="configured")
self.app_peer_data["units-added-to-cluster"] = "1"
self.unit_peer_data["unit-initialized"] = "True"

self.unit.status = ActiveStatus()
except MySQLCreateClusterError as e:
self.unit.status = BlockedStatus("Unable to create cluster")
logger.debug("Unable to create cluster: {}".format(e))

def _on_update_status(self, event: UpdateStatusEvent) -> None:
"""Handle the update status event.
1 change: 0 additions & 1 deletion src/constants.py
@@ -5,7 +5,6 @@

PASSWORD_LENGTH = 24
PEER = "database-peers"
CONFIGURED_FILE = "/var/lib/mysql/charmed"
CONTAINER_NAME = "mysql"
MYSQLD_SERVICE = "mysqld"
ROOT_USERNAME = "root"
18 changes: 16 additions & 2 deletions tests/integration/high_availability/conftest.py
@@ -3,12 +3,16 @@
# See LICENSE file for licensing details.

import pytest
from integration.high_availability.high_availability_helpers import get_application_name
from integration.high_availability.high_availability_helpers import (
deploy_chaos_mesh,
destroy_chaos_mesh,
get_application_name,
)
from pytest_operator.plugin import OpsTest


@pytest.fixture()
async def continuous_writes(ops_test: OpsTest):
async def continuous_writes(ops_test: OpsTest) -> None:
"""Starts continuous writes to the MySQL cluster for a test and clear the writes at the end."""
application_name = await get_application_name(ops_test, "application")

@@ -24,3 +28,13 @@ async def continuous_writes(ops_test: OpsTest):

clear_writes_action = await application_unit.run_action("clear-continuous-writes")
await clear_writes_action.wait()


@pytest.fixture()
async def chaos_mesh(ops_test: OpsTest) -> None:
"""Deploys choas mesh to the namespace and uninstalls it at the end."""
deploy_chaos_mesh(ops_test.model.info.name)

yield

destroy_chaos_mesh(ops_test.model.info.name)
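
The fixtures above are consumed by the new self-healing tests; the test module itself is not shown in this view. A minimal sketch of how a test would request them follows — the test name and marker are illustrative, not taken from the commit:

import pytest
from pytest_operator.plugin import OpsTest


@pytest.mark.abort_on_fail
async def test_recovery_after_network_cut(
    ops_test: OpsTest, continuous_writes, chaos_mesh
) -> None:
    # Simply requesting the fixtures does the setup/teardown work:
    # `continuous_writes` starts the write workload and clears it afterwards,
    # and `chaos_mesh` installs chaos mesh into the model's namespace and
    # removes it again once the test finishes.
    ...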
102 changes: 101 additions & 1 deletion tests/integration/high_availability/high_availability_helpers.py
@@ -2,6 +2,10 @@
# See LICENSE file for licensing details.

import logging
import os
import string
import subprocess
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple

Expand All @@ -19,7 +23,14 @@
)
from juju.unit import Unit
from pytest_operator.plugin import OpsTest
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed
from tenacity import (
RetryError,
Retrying,
retry,
stop_after_attempt,
stop_after_delay,
wait_fixed,
)

# Copied these values from high_availability.application_charm.src.charm
DATABASE_NAME = "continuous_writes_database"
@@ -228,6 +239,45 @@ async def relate_mysql_and_application(
)


def deploy_chaos_mesh(namespace: str) -> None:
"""Deploy chaos mesh to the provided namespace.
Args:
namespace: The namespace to deploy chaos mesh to
"""
env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")

subprocess.check_output(
" ".join(
[
"tests/integration/high_availability/scripts/deploy_chaos_mesh.sh",
namespace,
]
),
shell=True,
env=env,
)


def destroy_chaos_mesh(namespace: str) -> None:
"""Remove chaos mesh from the provided namespace.
Args:
namespace: The namespace to remove chaos mesh from
"""
env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")

subprocess.check_output(
f"tests/integration/high_availability/scripts/destroy_chaos_mesh.sh {namespace}",
shell=True,
env=env,
)


async def high_availability_test_setup(ops_test: OpsTest) -> Tuple[str, str]:
"""Run the set up for high availability tests.
Expand Down Expand Up @@ -445,3 +495,53 @@ async def ensure_all_units_continuous_writes_incrementing(
), f"Missing {number} in database for unit {unit.name}"

last_max_written_value = max_written_value


def isolate_instance_from_cluster(ops_test: OpsTest, unit_name: str) -> None:
"""Apply a NetworkChaos file to use chaos-mesh to simulate a network cut."""
with tempfile.NamedTemporaryFile() as temp_file:
with open(
"tests/integration/high_availability/manifests/chaos_network_loss.yml", "r"
) as chaos_network_loss_file:
template = string.Template(chaos_network_loss_file.read())
chaos_network_loss = template.substitute(
namespace=ops_test.model.info.name,
pod=unit_name.replace("/", "-"),
)

temp_file.write(str.encode(chaos_network_loss))
temp_file.flush()

env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")
subprocess.check_output(
" ".join(["kubectl", "apply", "-f", temp_file.name]), shell=True, env=env
)


def remove_instance_isolation(ops_test: OpsTest) -> None:
"""Delete the NetworkChaos that is isolating the primary unit of the cluster."""
env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")
subprocess.check_output(
f"kubectl -n {ops_test.model.info.name} delete networkchaos network-loss-primary",
shell=True,
env=env,
)


@retry(
stop=stop_after_attempt(10),
wait=wait_fixed(30),
)
async def wait_until_units_in_status(
ops_test: OpsTest, units_to_check: List[Unit], online_unit: Unit, status: str
) -> None:
"""Waits until all units specified are in a given status, or timeout occurs."""
cluster_status = await get_cluster_status(ops_test, online_unit)

for unit in units_to_check:
assert (
cluster_status["defaultreplicaset"]["topology"][unit.name.replace("/", "-")]["status"]
== status
)
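
Taken together, these helpers implement the network-cut scenario named in the commit title; the test module that wires them up is part of this commit but not visible here. A rough sketch of the flow, assuming typical signatures for the helpers whose definitions are truncated above and using illustrative status strings:

from integration.high_availability.high_availability_helpers import (
    ensure_all_units_continuous_writes_incrementing,
    high_availability_test_setup,
    isolate_instance_from_cluster,
    remove_instance_isolation,
    wait_until_units_in_status,
)
from pytest_operator.plugin import OpsTest


async def test_network_cut(ops_test: OpsTest, continuous_writes, chaos_mesh) -> None:
    mysql_application_name, _ = await high_availability_test_setup(ops_test)
    mysql_units = ops_test.model.applications[mysql_application_name].units

    # The real test isolates the cluster primary (hence the manifest name
    # "network-loss-primary"); picking the first unit keeps this sketch short.
    isolated_unit = mysql_units[0]
    online_unit = next(u for u in mysql_units if u.name != isolated_unit.name)

    # Apply the NetworkChaos manifest so the unit drops all cluster traffic.
    isolate_instance_from_cluster(ops_test, isolated_unit.name)

    # The remaining members should eventually report the isolated member as
    # unreachable ("(missing)" is an assumed status string, not from the diff).
    await wait_until_units_in_status(ops_test, [isolated_unit], online_unit, "(missing)")

    # Lift the isolation and wait for the member to rejoin as "online".
    remove_instance_isolation(ops_test)
    await wait_until_units_in_status(ops_test, mysql_units, online_unit, "online")

    # Writes must keep incrementing on every unit once the cluster has healed;
    # the exact signature here is assumed from the truncated helper above.
    await ensure_all_units_continuous_writes_incrementing(ops_test, mysql_units)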
16 changes: 16 additions & 0 deletions tests/integration/high_availability/manifests/chaos_network_loss.yml
@@ -0,0 +1,16 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
name: network-loss-primary
namespace: $namespace
spec:
action: loss
mode: one
selector:
pods:
$namespace:
- $pod
loss:
loss: "100"
correlation: "100"
duration: "60m"
22 changes: 22 additions & 0 deletions tests/integration/high_availability/scripts/deploy_chaos_mesh.sh
@@ -0,0 +1,22 @@
#!/bin/bash

chaos_mesh_ns=$1
chaos_mesh_version="2.4.1"

if [ -z "${chaos_mesh_ns}" ]; then
exit 1
fi

deploy_chaos_mesh() {
if [ "$(helm repo list | grep 'chaos-mesh' | wc -l)" != "1" ]; then
echo "adding chaos-mesh helm repo"
helm repo add chaos-mesh https://charts.chaos-mesh.org
fi

echo "installing chaos-mesh"
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${chaos_mesh_ns} --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/var/snap/microk8s/common/run/containerd.sock --set dashboard.create=false --version ${chaos_mesh_version} --set clusterScoped=false --set controllerManager.targetNamespace=${chaos_mesh_ns}
sleep 10
}

echo "namespace=${chaos_mesh_ns}"
deploy_chaos_mesh