Skip to content

Commit

Permalink
Bugfix#932: Investigate why some shares did not go to failed state, b…
Browse files Browse the repository at this point in the history
…ut remained stuck or in-progress
  • Loading branch information
anushka-singh committed Dec 27, 2023
1 parent bed3d51 commit 35b1730
Show file tree
Hide file tree
Showing 11 changed files with 57 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -288,14 +288,18 @@ def update_state(self, session, share_uri, new_state):
return True

def update_state_single_item(self, session, share_item, new_state):
logger.info(f"Updating share item in DB {share_item.shareItemUri} status to {new_state}")
ShareObjectRepository.update_share_item_status(
session=session,
uri=share_item.shareItemUri,
status=new_state
)
self._state = new_state
return True
try:
logger.info(f"Updating share item in DB {share_item.shareItemUri} status to {new_state}")
ShareObjectRepository.update_share_item_status(
session=session,
uri=share_item.shareItemUri,
status=new_state
)
self._state = new_state
return True
except Exception as e:
logger.error(f"Exception during update_state_single_item: {e}")
return False

@staticmethod
def get_share_item_shared_states():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,4 +266,4 @@ def revoke_share(cls, engine: Engine, share_uri: str):
new_share_state = share_sm.run_transition(ShareObjectActions.Finish.value)
share_sm.update_state(session, share, new_share_state)

return revoked_folders_succeed and revoked_s3_buckets_succeed and revoked_tables_succeed
return revoked_folders_succeed and revoked_s3_buckets_succeed and revoked_tables_succeed
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def trigger_table_sharing_failure_alarm(
target_environment: Environment,
):
log.info('Triggering share failure alarm...')
subject = f'ALARM: DATAALL Table {table.GlueTableName} Sharing Failure Notification'[:100]
subject = f'ALARM: DATAALL Sharing Failure Notification for Table {table.GlueTableName}'[:100]
message = f"""
You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to share the table {table.GlueTableName} with Lake Formation.
Expand Down Expand Up @@ -49,7 +49,7 @@ def trigger_revoke_table_sharing_failure_alarm(
target_environment: Environment,
):
log.info('Triggering share failure alarm...')
subject = f'ALARM: DATAALL Table {table.GlueTableName} Revoking LF permissions Failure Notification'[:100]
subject = f'ALARM: DATAALL Revoking LF permissions Failure Notification for Table {table.GlueTableName}'[:100]
message = f"""
You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to revoke Lake Formation permissions for table {table.GlueTableName} with Lake Formation.
Expand All @@ -74,7 +74,7 @@ def trigger_revoke_table_sharing_failure_alarm(

def trigger_dataset_sync_failure_alarm(self, dataset: Dataset, error: str):
log.info(f'Triggering dataset {dataset.name} tables sync failure alarm...')
subject = f'ALARM: DATAALL Dataset {dataset.name} Tables Sync Failure Notification'[:100]
subject = f'ALARM: DATAALL Dataset Tables Sync Failure Notification for {dataset.name}'[:100]
message = f"""
You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to synchronize Dataset {dataset.name} tables from AWS Glue to the Search Catalog.
Expand All @@ -97,7 +97,7 @@ def trigger_folder_sharing_failure_alarm(
target_environment: Environment,
):
log.info('Triggering share failure alarm...')
subject = f'ALARM: DATAALL Folder {folder.S3Prefix} Sharing Failure Notification'[:100]
subject = f'ALARM: DATAALL Folder Sharing Failure Notification for {folder.S3Prefix}'[:100]
message = f"""
You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to share the folder {folder.S3Prefix} with S3 Access Point.
Alarm Details:
Expand All @@ -123,7 +123,7 @@ def trigger_revoke_folder_sharing_failure_alarm(
target_environment: Environment,
):
log.info('Triggering share failure alarm...')
subject = f'ALARM: DATAALL Folder {folder.S3Prefix} Sharing Revoke Failure'[:100]
subject = f'ALARM: DATAALL Folder Sharing Revoke Failure for {folder.S3Prefix}'[:100]
message = f"""
You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to share the folder {folder.S3Prefix} with S3 Access Point.
Alarm Details:
Expand Down Expand Up @@ -165,7 +165,7 @@ def handle_bucket_sharing_failure(self, bucket: DatasetBucket,
target_environment: Environment,
alarm_type: str):
log.info(f'Triggering {alarm_type} failure alarm...')
subject = f'ALARM: DATAALL S3 Bucket {bucket.S3BucketName} {alarm_type} Failure'[:100]
subject = f'ALARM: DATAALL S3 Bucket Failure Notification for {bucket.S3BucketName} {alarm_type}'[:100]
message = f"""
You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to {alarm_type} the S3 Bucket {bucket.S3BucketName}.
Alarm Details:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,17 +480,21 @@ def handle_share_failure(
-------
True if alarm published successfully
"""
logging.error(
f'Failed to share table {table.GlueTableName} '
f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region}'
f'due to: {error}'
)
try:
logging.error(
f'Failed to share table {table.GlueTableName} '
f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region}'
f'due to: {error}'
)

DatasetAlarmService().trigger_table_sharing_failure_alarm(
table, self.share, self.target_environment
)
return True
DatasetAlarmService().trigger_table_sharing_failure_alarm(
table, self.share, self.target_environment
)
return True
except Exception as e:
logger.error(f"Exception during handle_share_failure: {e}")
return False

def handle_revoke_failure(
self,
Expand All @@ -504,16 +508,20 @@ def handle_revoke_failure(
-------
True if alarm published successfully
"""
logger.error(
f'Failed to revoke S3 permissions to table {table.GlueTableName} '
f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region} '
f'due to: {error}'
)
DatasetAlarmService().trigger_revoke_table_sharing_failure_alarm(
table, self.share, self.target_environment
)
return True
try:
logger.error(
f'Failed to revoke S3 permissions to table {table.GlueTableName} '
f'from source account {self.source_environment.AwsAccountId}//{self.source_environment.region} '
f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region} '
f'due to: {error}'
)
DatasetAlarmService().trigger_revoke_table_sharing_failure_alarm(
table, self.share, self.target_environment
)
return True
except Exception as e:
logger.error(f"Exception during handle_revoke_failure: {e}")
return False

def glue_client(self):
return GlueClient(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -553,4 +553,4 @@ def get_principal_list(statement):
principal_list = statement["Principal"]["AWS"]
if isinstance(principal_list, str):
principal_list = [principal_list]
return principal_list
return principal_list
Original file line number Diff line number Diff line change
Expand Up @@ -481,4 +481,4 @@ def generate_default_kms_decrypt_policy_statement(target_requester_arn):
},
"Action": "kms:Decrypt",
"Resource": "*"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,10 @@ def process_approved_shares(self) -> bool:
shared_item_SM.update_state_single_item(self.session, share_item, new_state)

except Exception as e:
self.handle_share_failure(table=table, share_item=share_item, error=e)
new_state = shared_item_SM.run_transition(ShareItemActions.Failure.value)
shared_item_SM.update_state_single_item(self.session, share_item, new_state)
success = False
self.handle_share_failure(table=table, share_item=share_item, error=e)

return success

Expand Down Expand Up @@ -178,9 +178,9 @@ def process_revoked_shares(self) -> bool:
revoked_item_SM.update_state_single_item(self.session, share_item, new_state)

except Exception as e:
self.handle_revoke_failure(share_item=share_item, table=table, error=e)
new_state = revoked_item_SM.run_transition(ShareItemActions.Failure.value)
revoked_item_SM.update_state_single_item(self.session, share_item, new_state)
success = False
self.handle_revoke_failure(share_item=share_item, table=table, error=e)

return success
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def process_approved_shares(self) -> bool:
shared_item_SM.update_state_single_item(self.session, share_item, new_state)

except Exception as e:
self.handle_share_failure(table, share_item, e)
new_state = shared_item_SM.run_transition(ShareItemActions.Failure.value)
shared_item_SM.update_state_single_item(self.session, share_item, new_state)
success = False
self.handle_share_failure(table, share_item, e)

return success

Expand Down Expand Up @@ -151,9 +151,9 @@ def process_revoked_shares(self) -> bool:
revoked_item_SM.update_state_single_item(self.session, share_item, new_state)

except Exception as e:
self.handle_revoke_failure(share_item, table, e)
new_state = revoked_item_SM.run_transition(ShareItemActions.Failure.value)
revoked_item_SM.update_state_single_item(self.session, share_item, new_state)
success = False
self.handle_revoke_failure(share_item, table, e)

return success
Original file line number Diff line number Diff line change
Expand Up @@ -212,4 +212,4 @@ def clean_up_share(
if not dataset.imported or dataset.importedKmsKey:
clean_up_folder.delete_dataset_bucket_key_policy(dataset=dataset)

return True
return True
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ def process_revoked_shares(
revoked_item_SM.update_state_single_item(session, removing_item, new_state)

except Exception as e:
removing_bucket.handle_revoke_failure(e)
new_state = revoked_item_SM.run_transition(ShareItemActions.Failure.value)
revoked_item_SM.update_state_single_item(session, removing_item, new_state)
success = False
removing_bucket.handle_revoke_failure(e)

return success
return success
Original file line number Diff line number Diff line change
Expand Up @@ -1531,4 +1531,4 @@ def test_delete_target_role_bucket_key_policy_with_multiple_principals_in_policy

assert 'SomeotherArn' in new_kms_policy["Statement"][0]["Principal"]["AWS"]
assert f"arn:aws:iam::{target_environment.AwsAccountId}:role/{target_environment.EnvironmentDefaultIAMRoleName}" not in \
new_kms_policy["Statement"][0]["Principal"]["AWS"]
new_kms_policy["Statement"][0]["Principal"]["AWS"]

0 comments on commit 35b1730

Please sign in to comment.