From bb4e1433e93ebf1ccd69fe20d6580edeb3e100c0 Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu
Date: Sat, 7 Dec 2024 10:50:45 +0200
Subject: [PATCH 1/5] Add test to ensure that parsing of wrong TSV files fails
 in strict mode

This is very crude. Ideally I would add tests for every case in which parsing
could fail, but I am way over the time limit now.
---
 tests/data/basic_strict_fail.tsv | 15 +++++++++++++++
 tests/test_parsers.py            | 14 ++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 tests/data/basic_strict_fail.tsv

diff --git a/tests/data/basic_strict_fail.tsv b/tests/data/basic_strict_fail.tsv
new file mode 100644
index 00000000..50680da4
--- /dev/null
+++ b/tests/data/basic_strict_fail.tsv
@@ -0,0 +1,15 @@
+# curie_map:
+#   HP: http://purl.obolibrary.org/obo/HP_
+#   MP: http://purl.obolibrary.org/obo/MP_
+#   owl: http://www.w3.org/2002/07/owl#
+#   rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
+#   rdfs: http://www.w3.org/2000/01/rdf-schema_fail#
+#   semapv: https://w3id.org/semapv/vocab/
+#   skos: http://www.w3.org/2004/02/skos/core#
+#   sssom: https://w3id.org/sssom/
+# license_fail: https://creativecommons.org/publicdomain/zero/1.0/
+# mapping_provider: http://purl.obolibrary.org/obo/upheno.owl
+# mapping_set_id: https://w3id.org/sssom/mappings/27f85fe9-8a72-4e76-909b-7ba4244d9ede
+subject_id	subject_label	predicate_id	object_id	object_label	mapping_fail_justification
+HP:0000175	Cleft palate	skos:exactMatch	MP:0000111	cleft palate	semapv:LexicalMatching
+HP:0000252	Microcephaly	skos:exactMatch	MP:0000433	microcephaly	semapv:LexicalMatching
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 3947bb1d..45deecb3 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -447,3 +447,17 @@ def test_round_trip_rdf(self):
     def test_round_trip_tsv(self):
         """Test writing then reading TSV."""
         self._basic_round_trip("tsv")
+
+    def test_strict_parsing(self):
+        """Test Strict parsing mode."""
+        input_path = f"{test_data_dir}/basic_strict_fail.tsv"
+        with open(input_path, "r") as file:
+            input_string = file.read()
+        stream = io.StringIO(input_string)
+
+        with self.assertRaises(ValueError):
+            parse_sssom_table(stream, strict=True)
+
+        # Make sure it parses in non-strict mode
+        msdf = parse_sssom_table(stream)
+        self.assertEqual(len(msdf.df), 2)

From ca01b56b422332c9076a3d3472ea98db25c4c829 Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu
Date: Sat, 7 Dec 2024 11:05:41 +0200
Subject: [PATCH 2/5] Extend parse_sssom_table to report wrong prefixes and
 metadata

This also comes with the introduction of a strict mode, which allows
draconian failure even when only small violations are encountered (by
default, we drop unknown metadata elements and redefined built-in prefixes).
The idea is: look for some basic errors before the actual parsing occurs,
fail if in strict mode, and otherwise proceed with normal parsing and repair.
---
 src/sssom/parsers.py | 97 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 28f96b43..45ae11e4 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -182,6 +182,97 @@ def _get_seperator_symbol_from_file_path(file):
     return None
 
 
+def _is_check_valid_extension_slot(slot_name):
+    logging.warning(
+        f"'{slot_name}' could be a valid extension slot "
+        f"(https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots), "
+        f"but the validator does not check that yet."
+    )
+    return False
+
+
+def _check_irregular_metadata(sssom_metadata, meta):
+    fail_metadata = False
+    for m in [sssom_metadata, meta]:
+        for key in m:
+            if (key not in _get_sssom_schema_object().mapping_set_slots) and (
+                not _is_check_valid_extension_slot(key)
+            ):
+                logging.warning(
+                    f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field."
+                )
+                fail_metadata = True
+    return fail_metadata
+
+
+def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):
+
+    # There are three ways in which prefixes can be communicated, so we will check all of them
+    # This is a bit overly draconian, as in the end, only the highest priority one gets picked
+    # But since this only constitutes a (logging) warning, I think it's worth reporting
+    builtin_converter = _get_built_in_prefix_map()
+    sssom_metadata_converter = _get_converter_pop_replace_curie_map(sssom_metadata)
+    meta_converter = _get_converter_pop_replace_curie_map(meta)
+    prefix_map_converter = ensure_converter(prefix_map, use_defaults=False)
+    is_valid_prefixes = True
+
+    for converter in [sssom_metadata_converter, meta_converter, prefix_map_converter]:
+        for builtin_prefix, builtin_uri in builtin_converter.bimap.items():
+            if builtin_prefix in converter.bimap:
+                if builtin_uri != converter.bimap[builtin_prefix]:
+                    logging.warning(
+                        f"A built-in prefix ({builtin_prefix}) was provided, "
+                        f"but the provided URI expansion ({converter.bimap[builtin_prefix]}) does not correspond "
+                        f"to the required URI expansion: {builtin_uri}. The prefix will be ignored."
+                    )
+                    is_valid_prefixes = False
+            reverse_bimap = {value: key for key, value in builtin_converter.bimap.items()}
+            if builtin_uri in reverse_bimap:
+                if builtin_prefix != reverse_bimap[builtin_uri]:
+                    logging.warning(
+                        f"A built-in URI namespace ({builtin_uri}) was used in (one of) the provided prefix map(s), "
+                        f"but the provided prefix ({reverse_bimap[builtin_uri]}) does not correspond to the "
+                        f"standard prefix: {builtin_prefix}. The prefix will be ignored."
+                    )
+                    is_valid_prefixes = False
+    return is_valid_prefixes
+
+
+def _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata):
+    report = ""
+    if not is_valid_built_in_prefixes:
+        report += "STRONG WARNING: The prefix map provided contains built-in prefixes that were redefined.\n"
+    if not is_valid_metadata:
+        report += (
+            "STRONG WARNING: The metadata provided contains non-standard and undefined metadata.\n"
+        )
+
+    if report:
+        raise ValueError(report)
+
+
+def _get_converter_pop_replace_curie_map(sssom_metadata):
+    """
+    Pop CURIE_MAP from sssom_metadata, process it, and restore it if it existed.
+
+    Args:
+        sssom_metadata (dict): The metadata dictionary.
+
+    Returns:
+        Converter: A Converter object created from the CURIE_MAP.
+    """
+    curie_map = sssom_metadata.pop(CURIE_MAP, {})
+
+    # Process the popped value
+    sssom_metadata_converter = Converter.from_prefix_map(curie_map)
+
+    # Reinsert CURIE_MAP if it was present
+    if curie_map:
+        sssom_metadata[CURIE_MAP] = curie_map
+
+    return sssom_metadata_converter
+
+
 def parse_sssom_table(
     file_path: Union[str, Path, TextIO],
     prefix_map: ConverterHint = None,
@@ -197,6 +288,12 @@
     if meta is None:
         meta = {}
 
+    is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
+    is_valid_metadata = _check_irregular_metadata(sssom_metadata, meta)
+
+    if kwargs.get("strict"):
+        _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)
+
     # The priority order for combining prefix maps are:
     # 1. Built-in prefix map
     # 2. Internal prefix map inside the document

From 9847b67391a2d07e55815c6bddfb44d3293fdfc3 Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu
Date: Sat, 7 Dec 2024 14:14:31 +0200
Subject: [PATCH 3/5] Adding support for "extension_slots" during parsing

Previously, parsing in strict mode failed whenever a non-standard metadata
element was encountered on the mapping_set level. This adds support for the
case where someone has legally specified an extension slot according to
https://w3id.org/sssom/spec.
---
 src/sssom/parsers.py  | 32 +++++++++++++++-----------------
 tests/test_parsers.py | 23 +++++++++++++++++++++++
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 45ae11e4..6191c7eb 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -182,26 +182,23 @@ def _get_seperator_symbol_from_file_path(file):
     return None
 
 
-def _is_check_valid_extension_slot(slot_name):
-    logging.warning(
-        f"'{slot_name}' could be a valid extension slot "
-        f"(https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots), "
-        f"but the validator does not check that yet."
-    )
-    return False
+def _is_check_valid_extension_slot(slot_name, meta):
+    extension_definitions = meta.get("extension_definitions", [])
+    return any(entry.get("slot_name") == slot_name for entry in extension_definitions)
 
 
-def _check_irregular_metadata(sssom_metadata, meta):
+def _is_irregular_metadata(metadata_list: List[Dict]):
     fail_metadata = False
-    for m in [sssom_metadata, meta]:
+    for m in metadata_list:
         for key in m:
-            if (key not in _get_sssom_schema_object().mapping_set_slots) and (
-                not _is_check_valid_extension_slot(key)
-            ):
-                logging.warning(
-                    f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field."
-                )
-                fail_metadata = True
+            if key not in _get_sssom_schema_object().mapping_set_slots:
+                if not _is_check_valid_extension_slot(key, m):
+                    logging.warning(
+                        f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field. See "
+                        f"https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots on how to "
+                        f"specify additional, non-standard fields in a SSSOM file."
+                    )
+                    fail_metadata = True
     return fail_metadata
 
 
@@ -226,6 +223,7 @@ def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):
                         f"to the required URI expansion: {builtin_uri}. The prefix will be ignored."
                     )
                     is_valid_prefixes = False
+            # NOTE during refactor replace the following line by https://github.com/biopragmatics/curies/pull/136
             reverse_bimap = {value: key for key, value in builtin_converter.bimap.items()}
             if builtin_uri in reverse_bimap:
                 if builtin_prefix != reverse_bimap[builtin_uri]:
@@ -289,7 +287,7 @@ def parse_sssom_table(
         meta = {}
 
     is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
-    is_valid_metadata = _check_irregular_metadata(sssom_metadata, meta)
+    is_valid_metadata = _is_irregular_metadata([sssom_metadata, meta])
 
     if kwargs.get("strict"):
         _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 45deecb3..c3182acc 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -461,3 +461,26 @@ def test_strict_parsing(self):
         # Make sure it parses in non-strict mode
         msdf = parse_sssom_table(stream)
         self.assertEqual(len(msdf.df), 2)
+
+    def test_check_irregular_metadata(self):
+        """Test if irregular metadata check works according to https://w3id.org/sssom/spec."""
+        meta_fail = {
+            "licenses": "http://licen.se",
+            "mapping_set_id": "http://mapping.set/id1",
+            "ext_test": "value",
+        }
+        meta_ok = {
+            "license": "http://licen.se",
+            "mapping_set_id": "http://mapping.set/id1",
+            "ext_test": "value",
+            "extension_definitions": [{"slot_name": "ext_test"}],
+        }
+
+        from sssom.parsers import _is_check_valid_extension_slot, _is_irregular_metadata
+
+        is_irregular_metadata_fail_case = _is_irregular_metadata([meta_fail])
+        is_valid_extension = _is_check_valid_extension_slot("ext_test", meta_ok)
+        is_irregular_metadata_ok_case = _is_irregular_metadata([meta_ok])
+        self.assertTrue(is_irregular_metadata_fail_case)
+        self.assertTrue(is_valid_extension)
+        self.assertFalse(is_irregular_metadata_ok_case)

From 57d24f9bbdaf7b640e0d9d7dc6e4ff04cdaf9ec2 Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu
Date: Sat, 7 Dec 2024 20:45:30 +0200
Subject: [PATCH 4/5] Amended the code to also look for "property" in
 extension slot definitions

Updated test case to show this works.
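For illustration, this is how the amended check behaves: an extension slot is
only accepted when its extension_definitions entry declares both a slot_name
and a property. A minimal sketch using the helper from this patch (the slot
name "ext_gene_context" and the property value below are invented for the
example; license and mapping_set_id values are reused from the test case):

    from sssom.parsers import _is_irregular_metadata

    meta = {
        "license": "http://licen.se",
        "mapping_set_id": "http://mapping.set/id1",
        "ext_gene_context": "some value",
        "extension_definitions": [
            {"slot_name": "ext_gene_context", "property": "skos:editorialNote"}
        ],
    }
    # Declared with both slot_name and property -> not flagged as irregular.
    assert not _is_irregular_metadata([meta])

    # Drop the property: the slot no longer counts as a declared extension.
    del meta["extension_definitions"][0]["property"]
    assert _is_irregular_metadata([meta])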
---
 src/sssom/parsers.py  |  3 +--
 tests/test_parsers.py | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 6191c7eb..23d6535c 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -184,8 +184,7 @@ def _get_seperator_symbol_from_file_path(file):
 
 def _is_check_valid_extension_slot(slot_name, meta):
     extension_definitions = meta.get("extension_definitions", [])
-    return any(entry.get("slot_name") == slot_name for entry in extension_definitions)
-
+    return any("property" in entry and entry.get("slot_name") == slot_name for entry in extension_definitions)
 
 def _is_irregular_metadata(metadata_list: List[Dict]):
     fail_metadata = False
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index c3182acc..019a807f 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -464,23 +464,32 @@ def test_strict_parsing(self):
 
     def test_check_irregular_metadata(self):
         """Test if irregular metadata check works according to https://w3id.org/sssom/spec."""
-        meta_fail = {
+        meta_fail_because_undeclared_extension = {
             "licenses": "http://licen.se",
             "mapping_set_id": "http://mapping.set/id1",
             "ext_test": "value",
         }
-        meta_ok = {
+        meta_fail_because_extension_without_property = {
             "license": "http://licen.se",
             "mapping_set_id": "http://mapping.set/id1",
             "ext_test": "value",
             "extension_definitions": [{"slot_name": "ext_test"}],
         }
+        meta_ok = {
+            "license": "http://licen.se",
+            "mapping_set_id": "http://mapping.set/id1",
+            "ext_test": "value",
+            "extension_definitions": [{"slot_name": "ext_test", "property": "skos:fantasyRelation"}],
+        }
+
         from sssom.parsers import _is_check_valid_extension_slot, _is_irregular_metadata
 
-        is_irregular_metadata_fail_case = _is_irregular_metadata([meta_fail])
+        is_irregular_metadata_fail_undeclared_case = _is_irregular_metadata([meta_fail_because_undeclared_extension])
         is_valid_extension = _is_check_valid_extension_slot("ext_test", meta_ok)
         is_irregular_metadata_ok_case = _is_irregular_metadata([meta_ok])
-        self.assertTrue(is_irregular_metadata_fail_case)
+        is_irregular_metadata_fail_missing_property_case = _is_irregular_metadata([meta_fail_because_extension_without_property])
+        self.assertTrue(is_irregular_metadata_fail_undeclared_case)
+        self.assertTrue(is_irregular_metadata_fail_missing_property_case)
         self.assertTrue(is_valid_extension)
         self.assertFalse(is_irregular_metadata_ok_case)

From 34856f1c394f9ea9e5d5a23759a62a3605fdf703 Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu
Date: Sat, 7 Dec 2024 20:56:21 +0200
Subject: [PATCH 5/5] Blacken the code.
---
 src/sssom/parsers.py  |  6 +++++-
 tests/test_parsers.py | 12 +++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 23d6535c..81a5b37c 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -184,7 +184,11 @@ def _get_seperator_symbol_from_file_path(file):
 
 def _is_check_valid_extension_slot(slot_name, meta):
     extension_definitions = meta.get("extension_definitions", [])
-    return any("property" in entry and entry.get("slot_name") == slot_name for entry in extension_definitions)
+    return any(
+        "property" in entry and entry.get("slot_name") == slot_name
+        for entry in extension_definitions
+    )
+
 
 def _is_irregular_metadata(metadata_list: List[Dict]):
     fail_metadata = False
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 019a807f..f29d9e48 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -480,15 +480,21 @@ def test_check_irregular_metadata(self):
             "license": "http://licen.se",
             "mapping_set_id": "http://mapping.set/id1",
             "ext_test": "value",
-            "extension_definitions": [{"slot_name": "ext_test", "property": "skos:fantasyRelation"}],
+            "extension_definitions": [
+                {"slot_name": "ext_test", "property": "skos:fantasyRelation"}
+            ],
         }
 
         from sssom.parsers import _is_check_valid_extension_slot, _is_irregular_metadata
 
-        is_irregular_metadata_fail_undeclared_case = _is_irregular_metadata([meta_fail_because_undeclared_extension])
+        is_irregular_metadata_fail_undeclared_case = _is_irregular_metadata(
+            [meta_fail_because_undeclared_extension]
+        )
         is_valid_extension = _is_check_valid_extension_slot("ext_test", meta_ok)
         is_irregular_metadata_ok_case = _is_irregular_metadata([meta_ok])
-        is_irregular_metadata_fail_missing_property_case = _is_irregular_metadata([meta_fail_because_extension_without_property])
+        is_irregular_metadata_fail_missing_property_case = _is_irregular_metadata(
+            [meta_fail_because_extension_without_property]
+        )
         self.assertTrue(is_irregular_metadata_fail_undeclared_case)
         self.assertTrue(is_irregular_metadata_fail_missing_property_case)
         self.assertTrue(is_valid_extension)
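
Taken together, the parser's behaviour on the new test file looks roughly like
this (a usage sketch, not part of the patches; the path is relative to the
repository root):

    from sssom.parsers import parse_sssom_table

    # Default (lenient) mode: redefined built-in prefixes and unknown metadata
    # keys are logged as warnings and dropped, and parsing proceeds with repair.
    msdf = parse_sssom_table("tests/data/basic_strict_fail.tsv")
    print(len(msdf.df))  # 2 mappings

    # Strict mode: the same pre-parse checks raise a ValueError instead.
    try:
        parse_sssom_table("tests/data/basic_strict_fail.tsv", strict=True)
    except ValueError as err:
        # "STRONG WARNING: ..." report assembled by _fail_in_strict_parsing_mode
        print(err)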