Skip to content

Commit

Permalink
Merge pull request #565 from mapping-commons/report-prefix-issues
Browse files Browse the repository at this point in the history
Extend parse_sssom_table to report wrong prefixes and metadata
  • Loading branch information
matentzn authored Dec 7, 2024
2 parents e625411 + 34856f1 commit e596d70
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 0 deletions.
98 changes: 98 additions & 0 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,98 @@ def _get_seperator_symbol_from_file_path(file):
return None


def _is_check_valid_extension_slot(slot_name, meta):
extension_definitions = meta.get("extension_definitions", [])
return any(
"property" in entry and entry.get("slot_name") == slot_name
for entry in extension_definitions
)


def _is_irregular_metadata(metadata_list: List[Dict]) -> bool:
    """Check whether any metadata dictionary uses a non-standard, undeclared key.

    A key is accepted when it is listed in ``mapping_set_slots`` of the object
    returned by ``_get_sssom_schema_object()``, or when the dictionary it appears
    in declares it (see ``_is_check_valid_extension_slot``). Every other key is
    reported through a logging warning.

    NOTE: the result is ``True`` when a problem WAS found. It is therefore the
    negation of "the metadata is valid"; callers that need a validity flag must
    negate it.

    :param metadata_list: Metadata dictionaries to inspect.
    :return: ``True`` if at least one key is neither standard nor declared,
        ``False`` otherwise (including for an empty list).
    """
    # The set of standard slots does not depend on the metadata being checked,
    # so it is looked up once instead of once per key.
    standard_slots = _get_sssom_schema_object().mapping_set_slots
    fail_metadata = False
    for metadata in metadata_list:
        for key in metadata:
            if key in standard_slots or _is_check_valid_extension_slot(key, metadata):
                continue
            logging.warning(
                f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field. See "
                f"https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots on how to "
                f"specify additional, non-standard fields in a SSSOM file."
            )
            fail_metadata = True
    return fail_metadata


def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):
    """Warn about provided prefix maps that contradict the built-in prefix map.

    Two kinds of conflict are reported, each through a logging warning:

    * a built-in prefix is provided with a URI expansion that differs from the
      built-in one;
    * a built-in URI expansion is provided under a prefix that differs from the
      built-in one.

    :param sssom_metadata: Metadata dictionary that may carry a CURIE map.
    :param meta: Second metadata dictionary that may carry a CURIE map.
    :param prefix_map: Prefix map hint, passed to ``ensure_converter``.
    :return: ``True`` if no conflict was found, ``False`` otherwise.
    """
    # There are three ways in which prefixes can be communicated, so we will check all of them.
    # This is a bit overly draconian, as in the end, only the highest priority one gets picked.
    # But since this only constitutes a (logging) warning, I think it's worth reporting.
    builtin_converter = _get_built_in_prefix_map()
    sssom_metadata_converter = _get_converter_pop_replace_curie_map(sssom_metadata)
    meta_converter = _get_converter_pop_replace_curie_map(meta)
    prefix_map_converter = ensure_converter(prefix_map, use_defaults=False)
    is_valid_prefixes = True

    for converter in [sssom_metadata_converter, meta_converter, prefix_map_converter]:
        provided_bimap = converter.bimap
        # The reverse lookup must be built from the *provided* map: reversing the
        # built-in map would compare it with itself and never detect a conflict.
        # NOTE during refactor replace the following line by https://github.com/biopragmatics/curies/pull/136
        reverse_bimap = {value: key for key, value in provided_bimap.items()}
        for builtin_prefix, builtin_uri in builtin_converter.bimap.items():
            if builtin_prefix in provided_bimap and builtin_uri != provided_bimap[builtin_prefix]:
                logging.warning(
                    f"A built-in prefix ({builtin_prefix}) was provided, "
                    f"but the provided URI expansion ({provided_bimap[builtin_prefix]}) does not correspond "
                    f"to the required URI expansion: {builtin_uri}. The prefix will be ignored."
                )
                is_valid_prefixes = False
            if builtin_uri in reverse_bimap and builtin_prefix != reverse_bimap[builtin_uri]:
                logging.warning(
                    f"A built-in URI namespace ({builtin_uri}) was used in (one of) the provided prefix map(s), "
                    f"but the provided prefix ({reverse_bimap[builtin_uri]}) does not correspond to the "
                    f"standard prefix: {builtin_prefix}. The prefix will be ignored."
                )
                is_valid_prefixes = False
    return is_valid_prefixes


def _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata):
report = ""
if not is_valid_built_in_prefixes:
report += "STRONG WARNING: The prefix map provided contains built-in prefixes that were redefined.+\n"
if not is_valid_metadata:
report += (
"STRONG WARNING: The metadata provided contains non-standard and undefined metadata.+\n"
)

if report:
raise ValueError(report)


def _get_converter_pop_replace_curie_map(sssom_metadata):
    """Build a converter from the CURIE map stored in a metadata dictionary.

    Despite the function's name, the dictionary is only read and never modified:
    an existing entry under ``CURIE_MAP`` (even an empty one) stays in place and
    the key order of ``sssom_metadata`` is preserved.

    :param sssom_metadata: The metadata dictionary.
    :return: A ``Converter`` created from the value stored under ``CURIE_MAP``;
        built from an empty prefix map when the key is missing or its value is
        empty/``None``.
    """
    # A plain read is enough here; popping and re-inserting would drop an empty
    # map and move the key to the end of the dictionary.
    curie_map = sssom_metadata.get(CURIE_MAP) or {}
    return Converter.from_prefix_map(curie_map)


def parse_sssom_table(
file_path: Union[str, Path, TextIO],
prefix_map: ConverterHint = None,
Expand All @@ -197,6 +289,12 @@ def parse_sssom_table(
if meta is None:
meta = {}

is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
is_valid_metadata = _is_irregular_metadata([sssom_metadata, meta])

if kwargs.get("strict"):
_fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)

# The priority order for combining prefix maps are:
# 1. Built-in prefix map
# 2. Internal prefix map inside the document
Expand Down
15 changes: 15 additions & 0 deletions tests/data/basic_strict_fail.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# curie_map:
# HP: http://purl.obolibrary.org/obo/HP_
# MP: http://purl.obolibrary.org/obo/MP_
# owl: http://www.w3.org/2002/07/owl#
# rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
# rdfs: http://www.w3.org/2000/01/rdf-schema_fail#
# semapv: https://w3id.org/semapv/vocab/
# skos: http://www.w3.org/2004/02/skos/core#
# sssom: https://w3id.org/sssom/
# license_fail: https://creativecommons.org/publicdomain/zero/1.0/
# mapping_provider: http://purl.obolibrary.org/obo/upheno.owl
# mapping_set_id: https://w3id.org/sssom/mappings/27f85fe9-8a72-4e76-909b-7ba4244d9ede
subject_id subject_label predicate_id object_id object_label mapping_fail_justification
HP:0000175 Cleft palate skos:exactMatch MP:0000111 cleft palate semapv:LexicalMatching
HP:0000252 Microcephaly skos:exactMatch MP:0000433 microcephaly semapv:LexicalMatching
52 changes: 52 additions & 0 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,3 +447,55 @@ def test_round_trip_rdf(self):
def test_round_trip_tsv(self):
"""Test writing then reading TSV."""
self._basic_round_trip("tsv")

def test_strict_parsing(self):
    """Test Strict parsing mode."""
    input_path = f"{test_data_dir}/basic_strict_fail.tsv"
    with open(input_path, "r") as file:
        input_string = file.read()

    # Each parse gets its own stream, so the non-strict parse below does not
    # depend on where the failed strict parse left the read position.
    with self.assertRaises(ValueError):
        parse_sssom_table(io.StringIO(input_string), strict=True)

    # Make sure it parses in non-strict mode
    msdf = parse_sssom_table(io.StringIO(input_string))
    self.assertEqual(len(msdf.df), 2)

def test_check_irregular_metadata(self):
    """Test if irregular metadata check works according to https://w3id.org/sssom/spec."""
    from sssom.parsers import _is_check_valid_extension_slot, _is_irregular_metadata

    # Uses keys that are neither standard nor declared as extensions.
    undeclared_extension = {
        "licenses": "http://licen.se",
        "mapping_set_id": "http://mapping.set/id1",
        "ext_test": "value",
    }
    self.assertTrue(_is_irregular_metadata([undeclared_extension]))

    # The extension is declared together with its property.
    declared_extension = {
        "license": "http://licen.se",
        "mapping_set_id": "http://mapping.set/id1",
        "ext_test": "value",
        "extension_definitions": [
            {"slot_name": "ext_test", "property": "skos:fantasyRelation"}
        ],
    }
    self.assertTrue(_is_check_valid_extension_slot("ext_test", declared_extension))
    self.assertFalse(_is_irregular_metadata([declared_extension]))

    # The extension is listed, but its definition lacks the property.
    extension_without_property = {
        "license": "http://licen.se",
        "mapping_set_id": "http://mapping.set/id1",
        "ext_test": "value",
        "extension_definitions": [{"slot_name": "ext_test"}],
    }
    self.assertTrue(_is_irregular_metadata([extension_without_property]))

0 comments on commit e596d70

Please sign in to comment.