Skip to content

Commit

Permalink
Strip trailing tabs in embedded YAML header.
Browse files Browse the repository at this point in the history
When extracting the embedded YAML header from a SSSOM/TSV file, remove
any trailing tabs at the end of the header lines. Such tabs, which may
have been inserted by a SSSOM-unaware spreadsheet editor, would prevent
the YAML metadata block from being parsed correctly.

If a metadata slot _really_ needs to end with some tabs, it is still
possible to quote the slot value to prevent those tabs from being
stripped.

closes #566
  • Loading branch information
gouttegd committed Dec 13, 2024
1 parent 02d6005 commit 97b0039
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,11 @@ def _separate_metadata_and_table_from_stream(s: io.StringIO):
if header_section:
header_section = False
elif header_section:
metadata_component.write(line)
# We strip any trailing tabs. Such tabs may have been left
# by a spreadsheet editor who treated the header lines as
# if they were normal data lines; they would prevent the
# YAML parser from correctly parsing the metadata block.
metadata_component.write(line.rstrip("\t\n") + "\n")
else:
logging.info(
f"Line {line} is starting with hash symbol, but header section is already passed. "
Expand Down
21 changes: 21 additions & 0 deletions tests/data/trailing-tabs.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#curie_map:
# COMENT: https://example.com/entities/
# COMPID: https://example.com/people/
# ORGENT: https://example.org/entities/
# ORGPID: https://example.org/people/
#mapping_set_id: https://example.org/sets/exo2c
#mapping_set_title: O2C set
#creator_id:
# - ORGPID:0000-0000-0001-1234
# - COMPID:0000-0000-0002-5678
#license: https://creativecommons.org/licenses/by/4.0/
#publication_date: 2023-09-13
subject_id subject_label predicate_id object_id object_label mapping_justification
ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration
ORGENT:0002 bob skos:closeMatch COMENT:0012 beta semapv:ManualMappingCuration
ORGENT:0004 daphne skos:closeMatch COMENT:0014 delta semapv:ManualMappingCuration
ORGENT:0005 eve skos:closeMatch COMENT:0015 epsilon semapv:ManualMappingCuration
ORGENT:0006 fanny skos:closeMatch COMENT:0016 zeta semapv:ManualMappingCuration
ORGENT:0007 gavin skos:exactMatch COMENT:0013 gamma semapv:ManualMappingCuration
ORGENT:0008 hector skos:closeMatch COMENT:0017 eta semapv:ManualMappingCuration
ORGENT:0009 ivan skos:exactMatch COMENT:0019 iota semapv:ManualMappingCuration
11 changes: 11 additions & 0 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,17 @@ def test_parse_obographs_merged(self):
msdf = parse_sssom_table(outfile)
self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items())

def test_parse_trailing_tabs_in_metadata_header(self):
    """Parse the trailing-tabs fixture and check its metadata and row count.

    The fixture is read through ``parse_sssom_table``. The test then verifies
    that the ``mapping_set_id`` metadata slot has the expected value and that
    the resulting data frame holds exactly 8 mappings.
    """
    input_path = f"{test_data_dir}/trailing-tabs.sssom.tsv"
    parsed = parse_sssom_table(input_path)

    # The metadata block must have been parsed into the expected slot value.
    expected_set_id = "https://example.org/sets/exo2c"
    self.assertEqual(parsed.metadata["mapping_set_id"], expected_set_id)

    # Every data row of the fixture must have ended up in the table.
    mapping_count = len(parsed.df)
    failure_message = f"{input_path} has the wrong number of mappings."
    self.assertEqual(mapping_count, 8, failure_message)


class TestParseExplicit(unittest.TestCase):
"""This test case contains explicit tests for parsing."""
Expand Down

0 comments on commit 97b0039

Please sign in to comment.