Skip to content

Commit

Permalink
DPE-4699 backup logging and other fixes (#444)
Browse files Browse the repository at this point in the history
* better messaging for backup and extra timeout for service stop

* refinements and bug fixes:

* move s3 relation validation to lib
* fixed validation call (missing property decorator)
* wont rotate logs if mysqld not running
* kill client connection prior to stopping service
* enable streamed debug log for backup download

* libpatch bump

* rsync changes from vm

* replicated from vm

* bump libs

* testing split backup test

* leverage greater amount of runners

* strip output

* set offline mode on instance

* bump lib
  • Loading branch information
paulomach authored Jul 5, 2024
1 parent 36254a9 commit 2c0fe44
Show file tree
Hide file tree
Showing 13 changed files with 1,582 additions and 649 deletions.
4 changes: 2 additions & 2 deletions lib/charms/data_platform_libs/v0/upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def restart(self, event) -> None:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 17
LIBPATCH = 18

PYDEPS = ["pydantic>=1.10,<2", "poetry-core"]

Expand Down Expand Up @@ -921,7 +921,7 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None:
self.charm.unit.status = WaitingStatus("other units upgrading first...")
self.peer_relation.data[self.charm.unit].update({"state": "ready"})

if self.charm.app.planned_units() == 1:
if len(self.app_units) == 1:
# single unit upgrade, emit upgrade_granted event right away
getattr(self.on, "upgrade_granted").emit()

Expand Down
100 changes: 59 additions & 41 deletions lib/charms/mysql/v0/backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,14 @@ def is_unit_blocked(self) -> bool:
from charms.mysql.v0.mysql import (
MySQLConfigureInstanceError,
MySQLCreateClusterError,
MySQLCreateClusterSetError,
MySQLDeleteTempBackupDirectoryError,
MySQLDeleteTempRestoreDirectoryError,
MySQLEmptyDataDirectoryError,
MySQLExecuteBackupCommandsError,
MySQLGetMemberStateError,
MySQLInitializeJujuOperationsTableError,
MySQLKillSessionError,
MySQLOfflineModeAndHiddenInstanceExistsError,
MySQLPrepareBackupForRestoreError,
MySQLRescanClusterError,
Expand All @@ -80,13 +82,14 @@ def is_unit_blocked(self) -> bool:
from ops.charm import ActionEvent
from ops.framework import Object
from ops.jujuversion import JujuVersion
from ops.model import ActiveStatus, BlockedStatus
from ops.model import BlockedStatus, MaintenanceStatus

from constants import MYSQL_DATA_DIR

logger = logging.getLogger(__name__)

MYSQL_BACKUPS = "mysql-backups"
S3_INTEGRATOR_RELATION_NAME = "s3-parameters"

# The unique Charmhub library identifier, never change it
LIBID = "183844304be247129572309a5fb1e47c"
Expand All @@ -96,7 +99,7 @@ def is_unit_blocked(self) -> bool:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 8
LIBPATCH = 10


if typing.TYPE_CHECKING:
Expand All @@ -117,6 +120,10 @@ def __init__(self, charm: "MySQLOperatorCharm", s3_integrator: S3Requirer) -> No
self.framework.observe(self.charm.on.restore_action, self._on_restore)

# ------------------ Helpers ------------------
@property
def _s3_integrator_relation_exists(self) -> bool:
"""Returns whether a relation with the s3-integrator exists."""
return bool(self.model.get_relation(S3_INTEGRATOR_RELATION_NAME))

def _retrieve_s3_parameters(self) -> Tuple[Dict[str, str], List[str]]:
"""Retrieve S3 parameters from the S3 integrator relation.
Expand Down Expand Up @@ -176,11 +183,7 @@ def _upload_logs_to_s3(
Returns: bool indicating success
"""
logs = f"""Stdout:
{stdout}
Stderr:
{stderr}"""
logs = f"Stdout:\n{stdout}\n\nStderr:\n{stderr}"
logger.debug(f"Logs to upload to S3 at location {log_filename}:\n{logs}")

logger.info(
Expand All @@ -206,7 +209,7 @@ def _on_list_backups(self, event: ActionEvent) -> None:
List backups available to restore by this application.
"""
if not self.charm.s3_integrator_relation_exists:
if not self._s3_integrator_relation_exists:
event.fail("Missing relation with S3 integrator charm")
return

Expand All @@ -222,7 +225,9 @@ def _on_list_backups(self, event: ActionEvent) -> None:
event.set_results({"backups": self._format_backups_list(backups)})
except Exception as e:
error_message = (
e.message if hasattr(e, "message") else "Failed to retrieve backup ids from S3"
getattr(e, "message")
if hasattr(e, "message")
else "Failed to retrieve backup ids from S3"
)
logger.error(error_message)
event.fail(error_message)
Expand All @@ -233,7 +238,7 @@ def _on_create_backup(self, event: ActionEvent) -> None:
"""Handle the create backup action."""
logger.info("A backup has been requested on unit")

if not self.charm.s3_integrator_relation_exists:
if not self._s3_integrator_relation_exists:
logger.error("Backup failed: missing relation with S3 integrator charm")
event.fail("Missing relation with S3 integrator charm")
return
Expand Down Expand Up @@ -263,12 +268,13 @@ def _on_create_backup(self, event: ActionEvent) -> None:

# Test uploading metadata to S3 to test credentials before backup
juju_version = JujuVersion.from_environ()
metadata = f"""Date Backup Requested: {datetime_backup_requested}
Model Name: {self.model.name}
Application Name: {self.model.app.name}
Unit Name: {self.charm.unit.name}
Juju Version: {str(juju_version)}
"""
metadata = (
f"Date Backup Requested: {datetime_backup_requested}\n"
f"Model Name: {self.model.name}\n"
f"Application Name: {self.model.app.name}\n"
f"Unit Name: {self.charm.unit.name}\n"
f"Juju Version: {str(juju_version)}\n"
)

if not upload_content_to_s3(metadata, f"{backup_path}.metadata", s3_parameters):
logger.error("Backup failed: Failed to upload metadata to provided S3")
Expand Down Expand Up @@ -313,6 +319,7 @@ def _on_create_backup(self, event: ActionEvent) -> None:
"backup-id": datetime_backup_requested,
}
)
self.charm._on_update_status(None)

def _can_unit_perform_backup(self) -> Tuple[bool, Optional[str]]:
"""Validates whether this unit can perform a backup.
Expand Down Expand Up @@ -380,8 +387,9 @@ def _backup(self, backup_path: str, s3_parameters: Dict) -> Tuple[bool, Optional
Returns: tuple of (success, error_message)
"""
try:
self.charm.unit.status = MaintenanceStatus("Running backup...")
logger.info("Running the xtrabackup commands")
stdout = self.charm._mysql.execute_backup_commands(
stdout, _ = self.charm._mysql.execute_backup_commands(
backup_path,
s3_parameters,
)
Expand Down Expand Up @@ -436,7 +444,7 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:
Returns: a boolean indicating whether restore should be run
"""
if not self.charm.s3_integrator_relation_exists:
if not self._s3_integrator_relation_exists:
error_message = "Missing relation with S3 integrator charm"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
Expand Down Expand Up @@ -481,7 +489,7 @@ def _on_restore(self, event: ActionEvent) -> None:
if not self._pre_restore_checks(event):
return

backup_id = event.params.get("backup-id").strip().strip("/")
backup_id = event.params["backup-id"].strip().strip("/")
logger.info(f"A restore with backup-id {backup_id} has been requested on unit")

# Retrieve and validate missing S3 parameters
Expand All @@ -500,6 +508,7 @@ def _on_restore(self, event: ActionEvent) -> None:
return

# Run operations to prepare for the restore
self.charm.unit.status = MaintenanceStatus("Running pre-restore operations")
success, error_message = self._pre_restore()
if not success:
logger.error(f"Restore failed: {error_message}")
Expand All @@ -520,6 +529,7 @@ def _on_restore(self, event: ActionEvent) -> None:
return

# Run post-restore operations
self.charm.unit.status = MaintenanceStatus("Running post-restore operations")
success, error_message = self._post_restore()
if not success:
logger.error(f"Restore failed: {error_message}")
Expand All @@ -533,22 +543,32 @@ def _on_restore(self, event: ActionEvent) -> None:
"completed": "ok",
}
)
# update status as soon as possible
self.charm._on_update_status(None)

def _pre_restore(self) -> Tuple[bool, Optional[str]]:
def _pre_restore(self) -> Tuple[bool, str]:
"""Perform operations that need to be done before performing a restore.
Returns: tuple of (success, error_message)
"""
if not self.charm._mysql.is_mysqld_running():
return True, ""

try:
logger.info("Stopping mysqld before restoring the backup")
self.charm._mysql.kill_client_sessions()
self.charm._mysql.set_instance_offline_mode(True)
self.charm._mysql.stop_mysqld()
except MySQLKillSessionError:
return False, "Failed to kill client sessions"
except MySQLSetInstanceOfflineModeError:
return False, "Failed to set instance as offline before restoring the backup"
except MySQLStopMySQLDError:
return False, "Failed to stop mysqld"

return True, None
return True, ""

def _restore(
self, backup_id: str, s3_parameters: Dict[str, str]
) -> Tuple[bool, bool, Optional[str]]:
def _restore(self, backup_id: str, s3_parameters: Dict[str, str]) -> Tuple[bool, bool, str]:
"""Run the restore operations.
Args:
Expand All @@ -558,18 +578,21 @@ def _restore(
Returns: tuple of (success, recoverable_error, error_message)
"""
try:
logger.info("Running xbcloud get commands to retrieve the backup")
logger.info(
"Running xbcloud get commands to retrieve the backup\n"
"This operation can take long time depending on backup size and network speed"
)
self.charm.unit.status = MaintenanceStatus("Downloading backup...")
stdout, stderr, backup_location = self.charm._mysql.retrieve_backup_with_xbcloud(
backup_id,
s3_parameters,
)
logger.debug(f"Stdout of xbcloud get commands: {stdout}")
logger.debug(f"Stderr of xbcloud get commands: {stderr}")
except MySQLRetrieveBackupWithXBCloudError:
return False, True, f"Failed to retrieve backup {backup_id}"

try:
logger.info("Preparing retrieved backup using xtrabackup prepare")
self.charm.unit.status = MaintenanceStatus("Preparing for restore backup...")
stdout, stderr = self.charm._mysql.prepare_backup_for_restore(backup_location)
logger.debug(f"Stdout of xtrabackup prepare command: {stdout}")
logger.debug(f"Stderr of xtrabackup prepare command: {stderr}")
Expand All @@ -583,16 +606,17 @@ def _restore(
return False, False, "Failed to empty the data directory"

try:
self.charm.unit.status = MaintenanceStatus("Restoring backup...")
logger.info("Restoring the backup")
stdout, stderr = self.charm._mysql.restore_backup(backup_location)
logger.debug(f"Stdout of xtrabackup move-back command: {stdout}")
logger.debug(f"Stderr of xtrabackup move-back command: {stderr}")
except MySQLRestoreBackupError:
return False, False, f"Failed to restore backup {backup_id}"

return True, True, None
return True, True, ""

def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, Optional[str]]:
def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, str]:
"""Run idempotent operations run after restoring a backup.
Returns tuple of (success, error_message)
Expand All @@ -613,9 +637,9 @@ def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, Optional[str]]:
except MySQLStartMySQLDError:
return False, "Failed to start mysqld"

return True, None
return True, ""

def _post_restore(self) -> Tuple[bool, Optional[str]]:
def _post_restore(self) -> Tuple[bool, str]:
"""Run operations required after restoring a backup.
Returns: tuple of (success, error_message)
Expand All @@ -638,24 +662,18 @@ def _post_restore(self) -> Tuple[bool, Optional[str]]:
logger.info("Creating cluster on restored node")
unit_label = self.charm.unit.name.replace("/", "-")
self.charm._mysql.create_cluster(unit_label)
self.charm._mysql.create_cluster_set()
self.charm._mysql.initialize_juju_units_operations_table()

self.charm._mysql.rescan_cluster()

logger.info("Retrieving instance cluster state and role")
state, role = self.charm._mysql.get_member_state()
except MySQLCreateClusterError:
return False, "Failed to create InnoDB cluster on restored instance"
except MySQLCreateClusterSetError:
return False, "Failed to create InnoDB cluster-set on restored instance"
except MySQLInitializeJujuOperationsTableError:
return False, "Failed to initialize the juju operations table"
except MySQLRescanClusterError:
return False, "Failed to rescan the cluster"
except MySQLGetMemberStateError:
return False, "Failed to retrieve member state in restored instance"

self.charm.unit_peer_data["member-role"] = role
self.charm.unit_peer_data["member-state"] = state

self.charm.unit.status = ActiveStatus()

return True, None
return True, ""
Loading

0 comments on commit 2c0fe44

Please sign in to comment.