diff --git a/meta.py b/meta.py
index 829f60dfa..341b69c5e 100644
--- a/meta.py
+++ b/meta.py
@@ -13,6 +13,7 @@
 import irods_types
 from deepdiff import DeepDiff
 
+import meta_form
 import provenance
 import publication
 import schema as schema_
@@ -271,7 +272,7 @@ def collection_has_cloneable_metadata(callback, coll):
 
 @api.make()
 def api_meta_remove(ctx, coll):
-    """Remove a collection's metadata JSON, if it exist."""
+    """Remove a collection's metadata JSON, if it exists."""
     log.write(ctx, 'Remove metadata of coll {}'.format(coll))
 
     try:
@@ -790,3 +791,48 @@ def copy_user_metadata(ctx, source, target):
         log.write(ctx, "copy_user_metadata: copied user metadata from <{}> to <{}/original>".format(source, target))
     except Exception:
         log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target))
+
+
+def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name):
+    """Process a single data package to retrieve and validate that its metadata conforms to the schema.
+
+    :param ctx:          Combined type of a callback and rei struct
+    :param coll_name:    String representing the data package collection path.
+    :param schema_cache: Dictionary storing schema blueprints, can be empty.
+    :param report_name:  Name of report script (for logging)
+
+    :returns: A dictionary containing the schema short name and whether the metadata matches the schema.
+    """
+    metadata_path = get_latest_vault_metadata_path(ctx, coll_name)
+
+    if not metadata_path:
+        log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name))
+        return None
+
+    try:
+        metadata = jsonutil.read(ctx, metadata_path)
+    except Exception as exc:
+        # TODO write_stdout?
+        log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)))
+        return None
+
+    # Determine schema
+    schema_id = schema_.get_schema_id(ctx, metadata_path)
+    schema_shortname = schema_id.split("/")[-2]
+
+    # Retrieve schema and cache it for future use
+    schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id)
+    if schema_shortname in schema_cache:
+        schema_contents = schema_cache[schema_shortname]
+    else:
+        schema_contents = jsonutil.read(ctx, schema_path)
+        schema_cache[schema_shortname] = schema_contents
+
+    # Check whether metadata matches schema and log any errors
+    error_list = get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
+    match_schema = len(error_list) == 0
+    if not match_schema:
+        errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]
+        log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)))
+
+    return {"schema": schema_shortname, "match_schema": match_schema}
diff --git a/schema_transformation.py b/schema_transformation.py
index e7d024d3f..7dadd3340 100644
--- a/schema_transformation.py
+++ b/schema_transformation.py
@@ -19,7 +19,6 @@
 import session_vars
 
 import meta
-import meta_form
 import schema
 import schema_transformations
 from util import *
@@ -380,51 +379,6 @@ def html(f):
     return re.sub('((:param).*)|((:returns:).*)', ' ', description)
 
 
-def verify_package_schema(ctx, coll_name, schema_cache, report_name):
-    """Process a single data package to retrieve and validate that its metadata conforms to the schema.
-
-    :param ctx:          Combined type of a callback and rei struct
-    :param coll_name:    String representing the data package collection path.
-    :param schema_cache: Dictionary storing schema blueprints, can be empty.
-    :param report_name:  Name of report script (for logging)
-
-    :returns: A dictionary result containing if schema matches and the schema short name.
-    """
-    metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name)
-
-    if not metadata_path:
-        log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name))
-        return None
-
-    try:
-        metadata = jsonutil.read(ctx, metadata_path)
-    except Exception as exc:
-        # TODO write_stdout?
-        log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)))
-        return None
-
-    # Determine schema
-    schema_id = schema.get_schema_id(ctx, metadata_path)
-    schema_shortname = schema_id.split("/")[-2]
-
-    # Retrieve schema and cache it for future use
-    schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id)
-    if schema_shortname in schema_cache:
-        schema_contents = schema_cache[schema_shortname]
-    else:
-        schema_contents = jsonutil.read(ctx, schema_path)
-        schema_cache[schema_shortname] = schema_contents
-
-    # Check whether metadata matches schema and log any errors
-    error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
-    match_schema = len(error_list) == 0
-    if not match_schema:
-        errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]
-        log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)))
-
-    return {"schema": schema_shortname, "match_schema": match_schema}
-
-
 @rule.make(inputs=[], outputs=[0])
 def rule_batch_vault_metadata_schema_report(ctx):
     """Show vault metadata schema about each data package in vault
@@ -451,7 +405,7 @@
     for row in iter:
         try:
             coll_name = row[0]
-            result = verify_package_schema(ctx, coll_name, schema_cache, "Vault metadata schema report")
+            result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report")
             if result:
                 results[coll_name] = result
         except Exception as e:
diff --git a/troubleshoot_data.py b/troubleshoot_data.py
index d19ac1da7..971f6d42f 100644
--- a/troubleshoot_data.py
+++ b/troubleshoot_data.py
@@ -13,7 +13,7 @@
 import datacite
 from publication import get_publication_config
 
-from meta import verify_vault_metadata_matches_schema
+from meta import vault_metadata_matches_schema
 from util import *
 
 
@@ -356,7 +356,7 @@ def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_
     # Troubleshooting
     for data_package in data_packages:
         log.write_stdout(ctx, "Troubleshooting: {}".format(data_package))
-        schema_check = verify_vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-published-packages")['match_schema']
+        schema_check = vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-published-packages")['match_schema']
        no_missing_avus_check, no_unexpected_avus_check = check_data_package_system_avus(ctx, data_package)
         version_doi_check, base_doi_check = check_datacite_doi_registration(ctx, data_package)
         publication_config = get_publication_config(ctx)
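
Usage note: the relocated `vault_metadata_matches_schema` keeps the contract of the removed `verify_package_schema`. Below is a minimal caller sketch mirroring `rule_batch_vault_metadata_schema_report` above; the wrapper name `report_schema_matches` and the `coll_names` iterable are illustrative, not part of this change:

    import meta

    def report_schema_matches(ctx, coll_names):
        # One cache shared across all calls, so each schema blueprint
        # is read from iRODS only once per run.
        schema_cache = {}
        results = {}
        for coll_name in coll_names:
            # Returns None when metadata is missing or unreadable, otherwise
            # {"schema": <shortname>, "match_schema": <bool>}.
            result = meta.vault_metadata_matches_schema(
                ctx, coll_name, schema_cache, "Vault metadata schema report")
            if result:
                results[coll_name] = result
        return results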