Skip to content

Commit

Permalink
Move verify vault package metadata function to meta.py
Browse files Browse the repository at this point in the history
  • Loading branch information
claravox committed Sep 9, 2024
1 parent c134efb commit a082ea8
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 50 deletions.
48 changes: 47 additions & 1 deletion meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import irods_types
from deepdiff import DeepDiff

import meta_form
import provenance
import publication
import schema as schema_
Expand Down Expand Up @@ -271,7 +272,7 @@ def collection_has_cloneable_metadata(callback, coll):

@api.make()
def api_meta_remove(ctx, coll):
"""Remove a collection's metadata JSON, if it exist."""
"""Remove a collection's metadata JSON, if it exists."""
log.write(ctx, 'Remove metadata of coll {}'.format(coll))

try:
Expand Down Expand Up @@ -790,3 +791,48 @@ def copy_user_metadata(ctx, source, target):
log.write(ctx, "copy_user_metadata: copied user metadata from <{}> to <{}/original>".format(source, target))
except Exception:
log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target))


def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name):
    """Process a single data package to retrieve and validate that its metadata conforms to the schema.

    :param ctx:          Combined type of a callback and rei struct
    :param coll_name:    String representing the data package collection path.
    :param schema_cache: Dictionary storing schema blueprints, can be empty.
    :param report_name:  Name of report script (for logging)

    :returns: A dictionary containing the schema short name and whether the
              metadata matches that schema, or None if the metadata could not
              be found or read.
    """
    metadata_path = get_latest_vault_metadata_path(ctx, coll_name)

    if not metadata_path:
        log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name))
        return None

    try:
        metadata = jsonutil.read(ctx, metadata_path)
    except Exception as exc:
        # TODO write_stdout?
        log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)))
        return None

    # Determine which schema governs this data package.
    schema_id = schema_.get_schema_id(ctx, metadata_path)
    schema_shortname = schema_id.split("/")[-2]

    # Retrieve schema and cache it for future use. On a cache hit we skip
    # both the path resolution and the re-read of the schema file.
    if schema_shortname in schema_cache:
        schema_contents = schema_cache[schema_shortname]
    else:
        schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id)
        schema_contents = jsonutil.read(ctx, schema_path)
        schema_cache[schema_shortname] = schema_contents

    # Check whether metadata matches schema and log any errors.
    error_list = get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
    match_schema = len(error_list) == 0
    if not match_schema:
        errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]
        log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)))

    return {"schema": schema_shortname, "match_schema": match_schema}
48 changes: 1 addition & 47 deletions schema_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import session_vars

import meta
import meta_form
import schema
import schema_transformations
from util import *
Expand Down Expand Up @@ -380,51 +379,6 @@ def html(f):
return re.sub('((:param).*)|((:returns:).*)', ' ', description)


def verify_package_schema(ctx, coll_name, schema_cache, report_name):
    """Retrieve the metadata of one data package and validate it against its schema.

    :param ctx:          Combined type of a callback and rei struct
    :param coll_name:    String representing the data package collection path.
    :param schema_cache: Dictionary storing schema blueprints, can be empty.
    :param report_name:  Name of report script (for logging)

    :returns: A dictionary result containing if schema matches and the schema short name.
    """
    metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name)
    if not metadata_path:
        log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name))
        return None

    try:
        metadata = jsonutil.read(ctx, metadata_path)
    except Exception as exc:
        # TODO write_stdout?
        log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)))
        return None

    # Work out which schema governs this package.
    schema_id = schema.get_schema_id(ctx, metadata_path)
    schema_shortname = schema_id.split("/")[-2]

    # Load the schema, caching it by short name so later packages reuse it.
    schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id)
    if schema_shortname not in schema_cache:
        schema_cache[schema_shortname] = jsonutil.read(ctx, schema_path)
    schema_contents = schema_cache[schema_shortname]

    # Validate the metadata and log a summary of any validation errors.
    error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
    match_schema = len(error_list) == 0
    if not match_schema:
        errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]
        log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)))

    return {"schema": schema_shortname, "match_schema": match_schema}


@rule.make(inputs=[], outputs=[0])
def rule_batch_vault_metadata_schema_report(ctx):
"""Show vault metadata schema about each data package in vault
Expand All @@ -451,7 +405,7 @@ def rule_batch_vault_metadata_schema_report(ctx):
for row in iter:
try:
coll_name = row[0]
result = verify_package_schema(ctx, coll_name, schema_cache, "Vault metadata schema report")
result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report")
if result:
results[coll_name] = result
except Exception as e:
Expand Down
4 changes: 2 additions & 2 deletions troubleshoot_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import datacite
from publication import get_publication_config
from meta import verify_vault_metadata_matches_schema
from meta import vault_metadata_matches_schema
from util import *


Expand Down Expand Up @@ -356,7 +356,7 @@ def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_
# Troubleshooting
for data_package in data_packages:
log.write_stdout(ctx, "Troubleshooting: {}".format(data_package))
schema_check = verify_vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-published-packages")['match_schema']
schema_check = vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-published-packages")['match_schema']
no_missing_avus_check, no_unexpected_avus_check = check_data_package_system_avus(ctx, data_package)
version_doi_check, base_doi_check = check_datacite_doi_registration(ctx, data_package)
publication_config = get_publication_config(ctx)
Expand Down

0 comments on commit a082ea8

Please sign in to comment.