Implement an integration test for recovery after a network cut (#116)
* Refactor scale down logic to not force remove unreachable instances

* Add unit tests

* Address lint warnings

* Add sleep and idle timeout in test code in an attempt to stabilize tests

* Address lint warnings

* Add sleep after killing pod to give test environment time to stabilize

* Wait for unit to be removed from cluster after force removing it

* Wait for forcefully removed unit to actually be removed from the cluster

* Add integration test for recovery after a network cut

* Remove duplicate code introduced after merging main

* Install helm3 in CI runner for self healing integration tests

* Fix failing unit test

* Pin flake8 to v5.0.4 due to incompatibilities with v6

* Install helm on CI runners using sudo

* Address PR feedback; move chaos_mesh installation to fixture
shayancanonical authored Dec 4, 2022
1 parent 0d8ea2d commit 592a333
Showing 10 changed files with 344 additions and 74 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yaml
@@ -119,6 +119,8 @@ jobs:
# This is needed until https://bugs.launchpad.net/juju/+bug/1977582 is fixed
channel: 1.23/stable
bootstrap-options: "--agent-version 2.9.29"
- name: Install Helm
run: /usr/bin/sudo snap install helm3
- name: Run self healing integration tests
run: tox -e integration-self-healing

135 changes: 73 additions & 62 deletions src/charm.py
@@ -32,8 +32,8 @@
from constants import (
CLUSTER_ADMIN_PASSWORD_KEY,
CLUSTER_ADMIN_USERNAME,
CONFIGURED_FILE,
CONTAINER_NAME,
MYSQLD_CONFIG_FILE,
MYSQLD_SERVICE,
PASSWORD_LENGTH,
PEER,
@@ -251,7 +251,49 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
"app", required_password, generate_random_password(PASSWORD_LENGTH)
)

def _on_mysql_pebble_ready(self, event):
def _configure_instance(self, container) -> bool:
"""Configure the instance for use in Group Replication."""
try:
# Run mysqld for the first time to
# bootstrap the data directory and users
logger.debug("Initializing instance")
self._mysql.initialise_mysqld()

# Add the pebble layer
logger.debug("Adding pebble layer")
container.add_layer(MYSQLD_SERVICE, self._pebble_layer, combine=False)
container.restart(MYSQLD_SERVICE)

logger.debug("Waiting for instance to be ready")
self._mysql.wait_until_mysql_connection()

logger.info("Configuring instance")
# Configure all base users and revoke privileges from the root users
self._mysql.configure_mysql_users()
# Configure instance as a cluster node
self._mysql.configure_instance()

self.unit_peer_data["unit-configured"] = "True"
except (
MySQLConfigureInstanceError,
MySQLConfigureMySQLUsersError,
MySQLInitialiseMySQLDError,
MySQLCreateCustomConfigFileError,
) as e:
logger.debug("Unable to configure instance: {}".format(e))
return False

try:
# Set workload version
workload_version = self._mysql.get_mysql_version()
self.unit.set_workload_version(workload_version)
except MySQLGetMySQLVersionError:
# Do not block the charm if the version cannot be retrieved
pass

return True

def _on_mysql_pebble_ready(self, event) -> None:
"""Pebble ready handler.
Define and start a pebble service and bootstrap instance.
@@ -264,83 +306,52 @@ def _on_mysql_pebble_ready(self, event):

container = event.workload

if container.exists(CONFIGURED_FILE):
# When reusing a volume
# Configure the layer when changed
if not container.exists(MYSQLD_CONFIG_FILE):
self._mysql.create_custom_config_file(
report_host=self.get_unit_hostname(self.unit.name)
)

if self.unit_peer_data.get("unit-configured"):
# Only update pebble layer if unit is already configured for GR
current_layer = container.get_plan()
new_layer = self._pebble_layer

if new_layer.services != current_layer.services:
logger.info("Add pebble layer")
logger.info("Adding pebble layer")

container.add_layer(MYSQLD_SERVICE, new_layer, combine=True)
container.restart(MYSQLD_SERVICE)
self._mysql.wait_until_mysql_connection()

self.unit.status = ActiveStatus()
return

# First run setup
self.unit.status = MaintenanceStatus("Initialising mysqld")
try:

# Run mysqld for the first time to
# bootstrap the data directory and users
logger.debug("Initialising instance")
self._mysql.initialise_mysqld()

# Create custom server config file
logger.debug("Create custom config")
self._mysql.create_custom_config_file(
report_host=self.get_unit_hostname(self.unit.name)
)

# Add the pebble layer
container.add_layer(MYSQLD_SERVICE, self._pebble_layer, combine=False)
container.restart(MYSQLD_SERVICE)
logger.debug("Waiting for instance to be ready")
self._mysql.wait_until_mysql_connection()
logger.info("Configuring instance")
# Configure all base users and revoke
# privileges from the root users
self._mysql.configure_mysql_users()
# Configure instance as a cluster node
self._mysql.configure_instance()
# set workload version
workload_version = self._mysql.get_mysql_version()
self.unit.set_workload_version(workload_version)

except (
MySQLConfigureInstanceError,
MySQLConfigureMySQLUsersError,
MySQLInitialiseMySQLDError,
MySQLCreateCustomConfigFileError,
) as e:
# First run setup
if not self._configure_instance(container):
self.unit.status = BlockedStatus("Unable to configure instance")
logger.debug("Unable to configure instance: {}".format(e))
return
except MySQLGetMySQLVersionError:
# Do not block the charm if the version cannot be retrieved
pass

if self.unit.is_leader():
try:
# Create the cluster when is the leader unit
unit_label = self.unit.name.replace("/", "-")
self._mysql.create_cluster(unit_label)
logger.debug("Cluster configured on unit")
# Create control file in data directory
container.push(CONFIGURED_FILE, make_dirs=True, source="configured")
self.app_peer_data["units-added-to-cluster"] = "1"
self.unit_peer_data["unit-initialized"] = "True"
self.unit.status = ActiveStatus()
except MySQLCreateClusterError as e:
self.unit.status = BlockedStatus("Unable to create cluster")
logger.debug("Unable to create cluster: {}".format(e))
else:
# When unit is not the leader, it should wait
# for the leader to configure it a cluster node
if not self.unit.is_leader():
# Non-leader units should wait for leader to add them to the cluster
self.unit.status = WaitingStatus("Waiting for instance to join the cluster")
return

try:
# Create the cluster on the leader unit
logger.info("Creating cluster on the leader unit")
unit_label = self.unit.name.replace("/", "-")
self._mysql.create_cluster(unit_label)

# Create control file in data directory
container.push(CONFIGURED_FILE, make_dirs=True, source="configured")
self.app_peer_data["units-added-to-cluster"] = "1"
self.unit_peer_data["unit-initialized"] = "True"

self.unit.status = ActiveStatus()
except MySQLCreateClusterError as e:
self.unit.status = BlockedStatus("Unable to create cluster")
logger.debug("Unable to create cluster: {}".format(e))

def _on_update_status(self, event: UpdateStatusEvent) -> None:
"""Handle the update status event.
1 change: 0 additions & 1 deletion src/constants.py
@@ -5,7 +5,6 @@

PASSWORD_LENGTH = 24
PEER = "database-peers"
CONFIGURED_FILE = "/var/lib/mysql/charmed"
CONTAINER_NAME = "mysql"
MYSQLD_SERVICE = "mysqld"
ROOT_USERNAME = "root"
18 changes: 16 additions & 2 deletions tests/integration/high_availability/conftest.py
@@ -3,12 +3,16 @@
# See LICENSE file for licensing details.

import pytest
from integration.high_availability.high_availability_helpers import get_application_name
from integration.high_availability.high_availability_helpers import (
deploy_chaos_mesh,
destroy_chaos_mesh,
get_application_name,
)
from pytest_operator.plugin import OpsTest


@pytest.fixture()
async def continuous_writes(ops_test: OpsTest):
async def continuous_writes(ops_test: OpsTest) -> None:
"""Starts continuous writes to the MySQL cluster for a test and clear the writes at the end."""
application_name = await get_application_name(ops_test, "application")

@@ -24,3 +28,13 @@ async def continuous_writes(ops_test: OpsTest):

clear_writes_action = await application_unit.run_action("clear-continuous-writes")
await clear_writes_action.wait()


@pytest.fixture()
async def chaos_mesh(ops_test: OpsTest) -> None:
"""Deploys choas mesh to the namespace and uninstalls it at the end."""
deploy_chaos_mesh(ops_test.model.info.name)

yield

destroy_chaos_mesh(ops_test.model.info.name)
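
The fixtures above are consumed by the new self-healing tests; the test module itself is not shown in this view. A minimal sketch of how a test would request them follows — the test name and marker are illustrative, not taken from the commit:

import pytest
from pytest_operator.plugin import OpsTest


@pytest.mark.abort_on_fail
async def test_recovery_after_network_cut(
    ops_test: OpsTest, continuous_writes, chaos_mesh
) -> None:
    # Simply requesting the fixtures does the setup/teardown work:
    # `continuous_writes` starts the write workload and clears it afterwards,
    # and `chaos_mesh` installs chaos mesh into the model's namespace and
    # removes it again once the test finishes.
    ...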
102 changes: 101 additions & 1 deletion tests/integration/high_availability/high_availability_helpers.py
@@ -2,6 +2,10 @@
# See LICENSE file for licensing details.

import logging
import os
import string
import subprocess
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple

Expand All @@ -19,7 +23,14 @@
)
from juju.unit import Unit
from pytest_operator.plugin import OpsTest
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed
from tenacity import (
RetryError,
Retrying,
retry,
stop_after_attempt,
stop_after_delay,
wait_fixed,
)

# Copied these values from high_availability.application_charm.src.charm
DATABASE_NAME = "continuous_writes_database"
@@ -228,6 +239,45 @@ async def relate_mysql_and_application(
)


def deploy_chaos_mesh(namespace: str) -> None:
"""Deploy chaos mesh to the provided namespace.
Args:
namespace: The namespace to deploy chaos mesh to
"""
env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")

subprocess.check_output(
" ".join(
[
"tests/integration/high_availability/scripts/deploy_chaos_mesh.sh",
namespace,
]
),
shell=True,
env=env,
)


def destroy_chaos_mesh(namespace: str) -> None:
"""Remove chaos mesh from the provided namespace.
Args:
namespace: The namespace to remove chaos mesh from
"""
env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")

subprocess.check_output(
f"tests/integration/high_availability/scripts/destroy_chaos_mesh.sh {namespace}",
shell=True,
env=env,
)


async def high_availability_test_setup(ops_test: OpsTest) -> Tuple[str, str]:
"""Run the set up for high availability tests.
Expand Down Expand Up @@ -445,3 +495,53 @@ async def ensure_all_units_continuous_writes_incrementing(
), f"Missing {number} in database for unit {unit.name}"

last_max_written_value = max_written_value


def isolate_instance_from_cluster(ops_test: OpsTest, unit_name: str) -> None:
"""Apply a NetworkChaos file to use chaos-mesh to simulate a network cut."""
with tempfile.NamedTemporaryFile() as temp_file:
with open(
"tests/integration/high_availability/manifests/chaos_network_loss.yml", "r"
) as chaos_network_loss_file:
template = string.Template(chaos_network_loss_file.read())
chaos_network_loss = template.substitute(
namespace=ops_test.model.info.name,
pod=unit_name.replace("/", "-"),
)

temp_file.write(str.encode(chaos_network_loss))
temp_file.flush()

env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")
subprocess.check_output(
" ".join(["kubectl", "apply", "-f", temp_file.name]), shell=True, env=env
)


def remove_instance_isolation(ops_test: OpsTest) -> None:
"""Delete the NetworkChaos that is isolating the primary unit of the cluster."""
env = os.environ
env["KUBECONFIG"] = os.path.expanduser("~/.kube/config")
subprocess.check_output(
f"kubectl -n {ops_test.model.info.name} delete networkchaos network-loss-primary",
shell=True,
env=env,
)


@retry(
stop=stop_after_attempt(10),
wait=wait_fixed(30),
)
async def wait_until_units_in_status(
ops_test: OpsTest, units_to_check: List[Unit], online_unit: Unit, status: str
) -> None:
"""Waits until all units specified are in a given status, or timeout occurs."""
cluster_status = await get_cluster_status(ops_test, online_unit)

for unit in units_to_check:
assert (
cluster_status["defaultreplicaset"]["topology"][unit.name.replace("/", "-")]["status"]
== status
)
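
Taken together, these helpers implement the network-cut scenario named in the commit title; the test module that wires them up is part of this commit but not visible here. A rough sketch of the flow, assuming typical signatures for the helpers whose definitions are truncated above and using illustrative status strings:

from integration.high_availability.high_availability_helpers import (
    ensure_all_units_continuous_writes_incrementing,
    high_availability_test_setup,
    isolate_instance_from_cluster,
    remove_instance_isolation,
    wait_until_units_in_status,
)
from pytest_operator.plugin import OpsTest


async def test_network_cut(ops_test: OpsTest, continuous_writes, chaos_mesh) -> None:
    mysql_application_name, _ = await high_availability_test_setup(ops_test)
    mysql_units = ops_test.model.applications[mysql_application_name].units

    # The real test isolates the cluster primary (hence the manifest name
    # "network-loss-primary"); picking the first unit keeps this sketch short.
    isolated_unit = mysql_units[0]
    online_unit = next(u for u in mysql_units if u.name != isolated_unit.name)

    # Apply the NetworkChaos manifest so the unit drops all cluster traffic.
    isolate_instance_from_cluster(ops_test, isolated_unit.name)

    # The remaining members should eventually report the isolated member as
    # unreachable ("(missing)" is an assumed status string, not from the diff).
    await wait_until_units_in_status(ops_test, [isolated_unit], online_unit, "(missing)")

    # Lift the isolation and wait for the member to rejoin as "online".
    remove_instance_isolation(ops_test)
    await wait_until_units_in_status(ops_test, mysql_units, online_unit, "online")

    # Writes must keep incrementing on every unit once the cluster has healed;
    # the exact signature here is assumed from the truncated helper above.
    await ensure_all_units_continuous_writes_incrementing(ops_test, mysql_units)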
16 changes: 16 additions & 0 deletions tests/integration/high_availability/manifests/chaos_network_loss.yml
@@ -0,0 +1,16 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
name: network-loss-primary
namespace: $namespace
spec:
action: loss
mode: one
selector:
pods:
$namespace:
- $pod
loss:
loss: "100"
correlation: "100"
duration: "60m"
22 changes: 22 additions & 0 deletions tests/integration/high_availability/scripts/deploy_chaos_mesh.sh
@@ -0,0 +1,22 @@
#!/bin/bash

chaos_mesh_ns=$1
chaos_mesh_version="2.4.1"

if [ -z "${chaos_mesh_ns}" ]; then
exit 1
fi

deploy_chaos_mesh() {
if [ "$(helm repo list | grep 'chaos-mesh' | wc -l)" != "1" ]; then
echo "adding chaos-mesh helm repo"
helm repo add chaos-mesh https://charts.chaos-mesh.org
fi

echo "installing chaos-mesh"
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${chaos_mesh_ns} --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/var/snap/microk8s/common/run/containerd.sock --set dashboard.create=false --version ${chaos_mesh_version} --set clusterScoped=false --set controllerManager.targetNamespace=${chaos_mesh_ns}
sleep 10
}

echo "namespace=${chaos_mesh_ns}"
deploy_chaos_mesh