From 35b1730d982ab0b6a181d26ab412241d92dc3edd Mon Sep 17 00:00:00 2001
From: Anushka Singh
Date: Wed, 27 Dec 2023 14:41:58 -0500
Subject: [PATCH] Bugfix#932: Investigate why some shares did not go to failed state, but remained stuck or in-progress

---
 .../db/share_object_repositories.py           | 20 ++++----
 .../services/data_sharing_service.py          |  2 +-
 .../services/dataset_alarm_service.py         | 12 ++---
 .../share_managers/lf_share_manager.py        | 48 +++++++++++--------
 .../s3_access_point_share_manager.py          |  2 +-
 .../share_managers/s3_bucket_share_manager.py |  2 +-
 .../lf_process_cross_account_share.py         |  4 +-
 .../lf_process_same_account_share.py          |  4 +-
 .../s3_access_point_process_share.py          |  2 +-
 .../s3_bucket_process_share.py                |  4 +-
 .../tasks/test_s3_bucket_share_manager.py     |  2 +-
 11 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/backend/dataall/modules/dataset_sharing/db/share_object_repositories.py b/backend/dataall/modules/dataset_sharing/db/share_object_repositories.py
index e2944485d..af549a8b5 100644
--- a/backend/dataall/modules/dataset_sharing/db/share_object_repositories.py
+++ b/backend/dataall/modules/dataset_sharing/db/share_object_repositories.py
@@ -288,14 +288,18 @@ def update_state(self, session, share_uri, new_state):
         return True
 
     def update_state_single_item(self, session, share_item, new_state):
-        logger.info(f"Updating share item in DB {share_item.shareItemUri} status to {new_state}")
-        ShareObjectRepository.update_share_item_status(
-            session=session,
-            uri=share_item.shareItemUri,
-            status=new_state
-        )
-        self._state = new_state
-        return True
+        try:
+            logger.info(f"Updating share item in DB {share_item.shareItemUri} status to {new_state}")
+            ShareObjectRepository.update_share_item_status(
+                session=session,
+                uri=share_item.shareItemUri,
+                status=new_state
+            )
+            self._state = new_state
+            return True
+        except Exception as e:
+            logger.error(f"Exception during update_state_single_item: {e}")
+            return False
 
     @staticmethod
     def get_share_item_shared_states():
diff --git a/backend/dataall/modules/dataset_sharing/services/data_sharing_service.py b/backend/dataall/modules/dataset_sharing/services/data_sharing_service.py
index 7139d784d..fd54f5074 100644
--- a/backend/dataall/modules/dataset_sharing/services/data_sharing_service.py
+++ b/backend/dataall/modules/dataset_sharing/services/data_sharing_service.py
@@ -266,4 +266,4 @@ def revoke_share(cls, engine: Engine, share_uri: str):
             new_share_state = share_sm.run_transition(ShareObjectActions.Finish.value)
             share_sm.update_state(session, share, new_share_state)
 
-            return revoked_folders_succeed and revoked_s3_buckets_succeed and revoked_tables_succeed
\ No newline at end of file
+            return revoked_folders_succeed and revoked_s3_buckets_succeed and revoked_tables_succeed
diff --git a/backend/dataall/modules/dataset_sharing/services/dataset_alarm_service.py b/backend/dataall/modules/dataset_sharing/services/dataset_alarm_service.py
index 1d003297e..628f55f51 100644
--- a/backend/dataall/modules/dataset_sharing/services/dataset_alarm_service.py
+++ b/backend/dataall/modules/dataset_sharing/services/dataset_alarm_service.py
@@ -19,7 +19,7 @@ def trigger_table_sharing_failure_alarm(
         target_environment: Environment,
     ):
         log.info('Triggering share failure alarm...')
-        subject = f'ALARM: DATAALL Table {table.GlueTableName} Sharing Failure Notification'[:100]
+        subject = f'ALARM: DATAALL Sharing Failure Notification for Table {table.GlueTableName}'[:100]
         message = f"""
 You are receiving this email because your DATAALL {self.envname} environment
 in the {self.region} region has entered the ALARM state, because it failed to share the table {table.GlueTableName} with Lake Formation.
@@ -49,7 +49,7 @@ def trigger_revoke_table_sharing_failure_alarm(
         target_environment: Environment,
     ):
         log.info('Triggering share failure alarm...')
-        subject = f'ALARM: DATAALL Table {table.GlueTableName} Revoking LF permissions Failure Notification'[:100]
+        subject = f'ALARM: DATAALL Revoking LF permissions Failure Notification for Table {table.GlueTableName}'[:100]
         message = f"""
 You are receiving this email because your DATAALL {self.envname} environment
 in the {self.region} region has entered the ALARM state, because it failed to revoke Lake Formation permissions for table {table.GlueTableName} with Lake Formation.
@@ -74,7 +74,7 @@ def trigger_dataset_sync_failure_alarm(self, dataset: Dataset, error: str):
         log.info(f'Triggering dataset {dataset.name} tables sync failure alarm...')
-        subject = f'ALARM: DATAALL Dataset {dataset.name} Tables Sync Failure Notification'[:100]
+        subject = f'ALARM: DATAALL Dataset Tables Sync Failure Notification for {dataset.name}'[:100]
         message = f"""
 You are receiving this email because your DATAALL {self.envname} environment
 in the {self.region} region has entered the ALARM state, because it failed to synchronize Dataset {dataset.name} tables from AWS Glue to the Search Catalog.
@@ -97,7 +97,7 @@ def trigger_folder_sharing_failure_alarm(
         target_environment: Environment,
     ):
         log.info('Triggering share failure alarm...')
-        subject = f'ALARM: DATAALL Folder {folder.S3Prefix} Sharing Failure Notification'[:100]
+        subject = f'ALARM: DATAALL Folder Sharing Failure Notification for {folder.S3Prefix}'[:100]
         message = f"""
 You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to share the folder {folder.S3Prefix} with S3 Access Point.
 Alarm Details:
@@ -123,7 +123,7 @@ def trigger_revoke_folder_sharing_failure_alarm(
         target_environment: Environment,
    ):
         log.info('Triggering share failure alarm...')
-        subject = f'ALARM: DATAALL Folder {folder.S3Prefix} Sharing Revoke Failure'[:100]
+        subject = f'ALARM: DATAALL Folder Sharing Revoke Failure for {folder.S3Prefix}'[:100]
         message = f"""
 You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to share the folder {folder.S3Prefix} with S3 Access Point.
 Alarm Details:
@@ -165,7 +165,7 @@ def handle_bucket_sharing_failure(self, bucket: DatasetBucket,
                                       target_environment: Environment,
                                       alarm_type: str):
         log.info(f'Triggering {alarm_type} failure alarm...')
-        subject = f'ALARM: DATAALL S3 Bucket {bucket.S3BucketName} {alarm_type} Failure'[:100]
+        subject = f'ALARM: DATAALL S3 Bucket Failure Notification for {bucket.S3BucketName} {alarm_type}'[:100]
         message = f"""
 You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to {alarm_type} the S3 Bucket {bucket.S3BucketName}.
 Alarm Details:
diff --git a/backend/dataall/modules/dataset_sharing/services/share_managers/lf_share_manager.py b/backend/dataall/modules/dataset_sharing/services/share_managers/lf_share_manager.py
index df0c8a09d..ec6565e63 100644
--- a/backend/dataall/modules/dataset_sharing/services/share_managers/lf_share_manager.py
+++ b/backend/dataall/modules/dataset_sharing/services/share_managers/lf_share_manager.py
@@ -480,17 +480,21 @@ def handle_share_failure(
         -------
         True if alarm published successfully
         """
-        logging.error(
-            f'Failed to share table {table.GlueTableName} '
-            f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
-            f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region}'
-            f'due to: {error}'
-        )
+        try:
+            logging.error(
+                f'Failed to share table {table.GlueTableName} '
+                f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
+                f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region}'
+                f'due to: {error}'
+            )
 
-        DatasetAlarmService().trigger_table_sharing_failure_alarm(
-            table, self.share, self.target_environment
-        )
-        return True
+            DatasetAlarmService().trigger_table_sharing_failure_alarm(
+                table, self.share, self.target_environment
+            )
+            return True
+        except Exception as e:
+            logger.error(f"Exception during handle_share_failure: {e}")
+            return False
 
     def handle_revoke_failure(
         self,
@@ -504,16 +508,20 @@ def handle_revoke_failure(
         -------
         True if alarm published successfully
         """
-        logger.error(
-            f'Failed to revoke S3 permissions to table {table.GlueTableName} '
-            f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
-            f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region} '
-            f'due to: {error}'
-        )
-        DatasetAlarmService().trigger_revoke_table_sharing_failure_alarm(
-            table, self.share, self.target_environment
-        )
-        return True
+        try:
+            logger.error(
+                f'Failed to revoke S3 permissions to table {table.GlueTableName} '
+                f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
+                f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region} '
+                f'due to: {error}'
+            )
+            DatasetAlarmService().trigger_revoke_table_sharing_failure_alarm(
+                table, self.share, self.target_environment
+            )
+            return True
+        except Exception as e:
+            logger.error(f"Exception during handle_revoke_failure: {e}")
+            return False
 
     def glue_client(self):
         return GlueClient(
diff --git a/backend/dataall/modules/dataset_sharing/services/share_managers/s3_access_point_share_manager.py b/backend/dataall/modules/dataset_sharing/services/share_managers/s3_access_point_share_manager.py
index 1591ac227..ae1c02ca4 100644
--- a/backend/dataall/modules/dataset_sharing/services/share_managers/s3_access_point_share_manager.py
+++ b/backend/dataall/modules/dataset_sharing/services/share_managers/s3_access_point_share_manager.py
@@ -553,4 +553,4 @@ def get_principal_list(statement):
         principal_list = statement["Principal"]["AWS"]
         if isinstance(principal_list, str):
             principal_list = [principal_list]
-        return principal_list
\ No newline at end of file
+        return principal_list
diff --git a/backend/dataall/modules/dataset_sharing/services/share_managers/s3_bucket_share_manager.py b/backend/dataall/modules/dataset_sharing/services/share_managers/s3_bucket_share_manager.py
index 00c02b109..924dcd80c 100644
--- a/backend/dataall/modules/dataset_sharing/services/share_managers/s3_bucket_share_manager.py
+++ b/backend/dataall/modules/dataset_sharing/services/share_managers/s3_bucket_share_manager.py
@@ -481,4 +481,4 @@ def generate_default_kms_decrypt_policy_statement(target_requester_arn):
             },
             "Action": "kms:Decrypt",
             "Resource": "*"
-        }
\ No newline at end of file
+        }
diff --git a/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_cross_account_share.py b/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_cross_account_share.py
index 51ba97cc7..a9ed54657 100644
--- a/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_cross_account_share.py
+++ b/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_cross_account_share.py
@@ -111,10 +111,10 @@ def process_approved_shares(self) -> bool:
                     shared_item_SM.update_state_single_item(self.session, share_item, new_state)
 
                 except Exception as e:
-                    self.handle_share_failure(table=table, share_item=share_item, error=e)
                     new_state = shared_item_SM.run_transition(ShareItemActions.Failure.value)
                     shared_item_SM.update_state_single_item(self.session, share_item, new_state)
                     success = False
+                    self.handle_share_failure(table=table, share_item=share_item, error=e)
 
         return success
 
@@ -178,9 +178,9 @@ def process_revoked_shares(self) -> bool:
                     revoked_item_SM.update_state_single_item(self.session, share_item, new_state)
 
                 except Exception as e:
-                    self.handle_revoke_failure(share_item=share_item, table=table, error=e)
                     new_state = revoked_item_SM.run_transition(ShareItemActions.Failure.value)
                     revoked_item_SM.update_state_single_item(self.session, share_item, new_state)
                     success = False
+                    self.handle_revoke_failure(share_item=share_item, table=table, error=e)
 
         return success
diff --git a/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_same_account_share.py b/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_same_account_share.py
index 54df2d900..56af3d892 100644
--- a/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_same_account_share.py
+++ b/backend/dataall/modules/dataset_sharing/services/share_processors/lf_process_same_account_share.py
@@ -95,10 +95,10 @@ def process_approved_shares(self) -> bool:
                 shared_item_SM.update_state_single_item(self.session, share_item, new_state)
 
             except Exception as e:
-                self.handle_share_failure(table, share_item, e)
                 new_state = shared_item_SM.run_transition(ShareItemActions.Failure.value)
                 shared_item_SM.update_state_single_item(self.session, share_item, new_state)
                 success = False
+                self.handle_share_failure(table, share_item, e)
 
         return success
 
@@ -151,9 +151,9 @@ def process_revoked_shares(self) -> bool:
                 revoked_item_SM.update_state_single_item(self.session, share_item, new_state)
 
             except Exception as e:
-                self.handle_revoke_failure(share_item, table, e)
                 new_state = revoked_item_SM.run_transition(ShareItemActions.Failure.value)
                 revoked_item_SM.update_state_single_item(self.session, share_item, new_state)
                 success = False
+                self.handle_revoke_failure(share_item, table, e)
 
         return success
diff --git a/backend/dataall/modules/dataset_sharing/services/share_processors/s3_access_point_process_share.py b/backend/dataall/modules/dataset_sharing/services/share_processors/s3_access_point_process_share.py
index 043e502c3..da02c5a3c 100644
--- a/backend/dataall/modules/dataset_sharing/services/share_processors/s3_access_point_process_share.py
+++ b/backend/dataall/modules/dataset_sharing/services/share_processors/s3_access_point_process_share.py
@@ -212,4 +212,4 @@ def clean_up_share(
             if not dataset.imported or dataset.importedKmsKey:
                 clean_up_folder.delete_dataset_bucket_key_policy(dataset=dataset)
 
-        return True
\ No newline at end of file
+        return True
diff --git a/backend/dataall/modules/dataset_sharing/services/share_processors/s3_bucket_process_share.py b/backend/dataall/modules/dataset_sharing/services/share_processors/s3_bucket_process_share.py
index 898c138c2..e697755cd 100644
--- a/backend/dataall/modules/dataset_sharing/services/share_processors/s3_bucket_process_share.py
+++ b/backend/dataall/modules/dataset_sharing/services/share_processors/s3_bucket_process_share.py
@@ -161,9 +161,9 @@ def process_revoked_shares(
                 revoked_item_SM.update_state_single_item(session, removing_item, new_state)
 
             except Exception as e:
-                removing_bucket.handle_revoke_failure(e)
                 new_state = revoked_item_SM.run_transition(ShareItemActions.Failure.value)
                 revoked_item_SM.update_state_single_item(session, removing_item, new_state)
                 success = False
+                removing_bucket.handle_revoke_failure(e)
 
-        return success
\ No newline at end of file
+        return success
diff --git a/tests/modules/datasets/tasks/test_s3_bucket_share_manager.py b/tests/modules/datasets/tasks/test_s3_bucket_share_manager.py
index 0c39d6afb..1b1b02678 100644
--- a/tests/modules/datasets/tasks/test_s3_bucket_share_manager.py
+++ b/tests/modules/datasets/tasks/test_s3_bucket_share_manager.py
@@ -1531,4 +1531,4 @@ def test_delete_target_role_bucket_key_policy_with_multiple_principals_in_policy
     assert 'SomeotherArn' in new_kms_policy["Statement"][0]["Principal"]["AWS"]
 
     assert f"arn:aws:iam::{target_environment.AwsAccountId}:role/{target_environment.EnvironmentDefaultIAMRoleName}" not in \
-        new_kms_policy["Statement"][0]["Principal"]["AWS"]
\ No newline at end of file
+        new_kms_policy["Statement"][0]["Principal"]["AWS"]
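
Note for reviewers: the short sketch below is illustration only, not data.all code. FakeShareItemSM, notify_failure and process_item are made-up stand-ins for ShareItemSM, the handle_*_failure alarm helpers and the share processors touched above. It shows the ordering this patch enforces: the item is moved to the failed state in the store first, and the alarm step swallows its own exceptions, so a broken alarm backend can no longer leave a share item stuck in an in-progress state.

# Illustration only -- not data.all code. FakeShareItemSM, notify_failure and
# process_item are hypothetical stand-ins for ShareItemSM, the handle_*_failure
# helpers and the share processors changed in this patch.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class FakeShareItemSM:
    """Remembers the last state it was asked to persist (stands in for the DB update)."""

    def __init__(self) -> None:
        self.persisted_state = None

    def update_state_single_item(self, new_state: str) -> bool:
        try:
            self.persisted_state = new_state  # in data.all this writes the share item status
            return True
        except Exception as e:
            logger.error(f"Exception during update_state_single_item: {e}")
            return False


def notify_failure() -> bool:
    """Stands in for the alarm helpers: publishing the alarm can itself fail."""
    try:
        raise RuntimeError("alarm backend unavailable")  # simulated outage
    except Exception as e:
        logger.error(f"Exception while publishing failure alarm: {e}")
        return False  # swallowed, so the caller's state update is never lost


def process_item(sm: FakeShareItemSM) -> bool:
    success = True
    try:
        raise ValueError("simulated sharing error")  # the share step blows up
    except Exception:
        sm.update_state_single_item("Share_Failed")  # 1) persist the failed state first
        success = False
        notify_failure()                             # 2) only then raise the alarm
    return success


if __name__ == "__main__":
    sm = FakeShareItemSM()
    assert process_item(sm) is False
    assert sm.persisted_state == "Share_Failed"
    print(f"persisted state: {sm.persisted_state}")

Running the sketch with python3 prints the persisted state and exits cleanly even though both the simulated share step and the simulated alarm backend raise.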