Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable parsing of loaded Dataframes. Improve function naming. #82

Open
wants to merge 6 commits into
base: main
Choose a base branch
from

Conversation

mbaric758
Copy link
Collaborator

@mbaric758 mbaric758 commented Dec 19, 2024

  • Add option to pass a dictionary mapping loaded data frames to mapping files to the extraction functions.
  • Update naming of said functions to better reflect their usage and update README and docstring.
  • Update Read the Docs configuration to use Python 3.13 and the new function naming.
  • TODO: After merging, usages of extract and tabular.extract_all functions should be renamed in OncodashKB.

@mbaric758 mbaric758 requested a review from jdreo December 19, 2024 10:19
@mbaric758 mbaric758 added the enhancement Improvement over an existing feature label Dec 19, 2024
renaming of extract_all function.

diff --git a/src/ontoweaver/__init__.py b/src/ontoweaver/__init__.py
index a3cd4e8..9069eeb 100644
--- a/src/ontoweaver/__init__.py
+++ b/src/ontoweaver/__init__.py
@@ -1,8 +1,9 @@
-from typing import Tuple
+from typing import Tuple, Optional

 import biocypher
 import yaml
 import pandas as pd
+from networkx.classes import nodes

 from . import base
 Node = base.Node
@@ -23,16 +24,17 @@ from . import fusion
 __all__ = ['Node', 'Edge', 'Transformer', 'Adapter', 'All', 'tabular', 'types', 'transformer', 'serialize', 'congregate', 'merge', 'fuse', 'fusion']

-def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings, parallel_mapping = 0, separator = None, affix = "none", affix_separator = ":"):
-    """Calls several mappings, each on the related Pandas-redable tabular data file,
+def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings = None, loaded_data_mappings = None, parallel_mapping = 0, separator = None, affix = "none", affix_separator = ":"):
+    """Calls several mappings, each on the related Pandas-readable tabular data file,
        then reconciliate duplicated nodes and edges (on nodes' IDs, merging properties in lists),
        then export everything with BioCypher.
        Returns the path to the resulting import file.

        Args:
-           biocypher_config_path: the BioCypher configuration file
-           schema_path: the assembling schema file
-           data_mappings: a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them
+           biocypher_config_path: the BioCypher configuration file.
+           schema_path: the assembling schema file.
+           data_mappings: a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them.
+           loaded_data_mappings: a dictionary mapping a loaded Pandas data frame to the mapping yaml file.
            parallel_mapping (int): Number of workers to use in parallel mapping. Defaults to 0 for sequential processing.
            separator (str, optional): The separator to use for combining values in reconciliation. Defaults to None.
            affix (str, optional): The affix to use for type inclusion. Defaults to "none".
@@ -41,22 +43,38 @@ def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings
        Returns:
            The path to the import file.
    """
-
-    assert(type(data_mappings) == dict) # data_file => mapping_file
-
     nodes = []
     edges = []

-    for data_file, mapping_file in data_mappings.items():
-        table = pd.read_csv(data_file)
+    if data_mappings:

-        with open(mapping_file) as fd:
-            mapping = yaml.full_load(fd)
+        assert(type(data_mappings) == dict) # data_file => mapping_file

-        adapter = tabular.extract_all(table, mapping, parallel_mapping = parallel_mapping, affix = affix, separator = affix_separator)
+        for data_file, mapping_file in data_mappings.items():
+            table = pd.read_csv(data_file)

-        nodes += adapter.nodes
-        edges += adapter.edges
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(table, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges
+
+    if loaded_data_mappings:
+
+        assert(type(loaded_data_mappings) == dict) # data_frame => mapping_file
+
+        for data_frame, mapping_file in loaded_data_mappings.items():
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(data_frame, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges

     fnodes, fedges = fusion.reconciliate(nodes, edges, separator = separator)

@@ -74,14 +92,14 @@ def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings
     return import_file

-def extract(data_mappings: dict, parallel_mapping = 0, affix="none", separator=":") -> Tuple[list[Tuple], list[Tuple]]:
+def extract(data_mappings = None, loaded_data_mappings = None, parallel_mapping = 0, affix="none", affix_separator=":") -> Tuple[list[Tuple], list[Tuple]]:
     """
     Extracts nodes and edges from tabular data files based on provided mappings.

     Args:
-        data_mappings (dict): a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them
+        data_mappings (dict): a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them.
+        loaded_data_mappings (dict): a dictionary mapping a loaded Pandas data frame to the mapping yaml file.
         parallel_mapping (int): Number of workers to use in parallel mapping. Defaults to 0 for sequential processing.
-        separator (str, optional): The separator to use for splitting ID and type. Defaults to None.
         affix (str, optional): The affix to use for type inclusion. Defaults to "none".
         affix_separator: The character(s) separating the label from its type affix. Defaults to ":".

@@ -89,21 +107,38 @@ def extract(data_mappings: dict, parallel_mapping = 0, affix="none", separator="
         tuple: Two lists of tuples containing nodes and edges.
     """

-    assert(type(data_mappings) == dict) # data_file => mapping_file
-
     nodes = []
     edges = []

-    for data_file, mapping_file in data_mappings.items():
-        table = pd.read_csv(data_file, sep = None)
+    if data_mappings:

-        with open(mapping_file) as fd:
-            mapping = yaml.full_load(fd)
+        assert(type(data_mappings) == dict) # data_file => mapping_file

-        adapter = tabular.extract_all(table, mapping, parallel_mapping=parallel_mapping, affix=affix, separator=separator)
+        for data_file, mapping_file in data_mappings.items():
+            table = pd.read_csv(data_file)

-        nodes += adapter.nodes
-        edges += adapter.edges
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(table, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges
+
+    if loaded_data_mappings:
+
+        assert(type(loaded_data_mappings) == dict) # data_frame => mapping_file
+
+        for data_frame, mapping_file in loaded_data_mappings.items():
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(data_frame, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges

     return nodes, edges

diff --git a/src/ontoweaver/tabular.py b/src/ontoweaver/tabular.py
index c1d852e..ea1aca6 100644
--- a/src/ontoweaver/tabular.py
+++ b/src/ontoweaver/tabular.py
@@ -394,7 +394,7 @@ class PandasAdapter(base.Adapter):
                 f"Performed {nb_transformations} transformations with {len(self.transformers)} transformers, producing {nb_nodes} nodes for {nb_rows} rows.")

-def extract_all(df: pd.DataFrame, config: dict, parallel_mapping = 0, module = types, affix = "suffix", separator = ":"):
+def extract_table(df: pd.DataFrame, config: dict, parallel_mapping = 0, module = types, affix = "suffix", separator = ":"):
     """
     Proxy function for extracting from a table all nodes, edges and properties
     that are defined in a PandasAdapter configuration.
diff --git a/tests/test_2ontologies.py b/tests/test_2ontologies.py
index c329041..a053a5e 100644
--- a/tests/test_2ontologies.py
+++ b/tests/test_2ontologies.py
@@ -18,7 +18,7 @@ def main():
     with open("oim.yaml") as fd:
         mapping = yaml.full_load(fd)

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)
     assert(adapter)

     assert(adapter.nodes)
diff --git a/tests/test_affix_separator.py b/tests/test_affix_separator.py
index 6e1b16d..55dc69f 100644
--- a/tests/test_affix_separator.py
+++ b/tests/test_affix_separator.py
@@ -29,7 +29,7 @@ def test_affix_separator():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="prefix", separator="___")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="prefix", separator="___")

     assert (adapter)

diff --git a/tests/test_edges_between_columns.py b/tests/test_edges_between_columns.py
index b0310c0..c33e09a 100644
--- a/tests/test_edges_between_columns.py
+++ b/tests/test_edges_between_columns.py
@@ -31,7 +31,7 @@ def test_edges_between_columns():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)

     assert (adapter)

diff --git a/tests/test_multiple_databases.py b/tests/test_multiple_databases.py
index da6bba0..c69c9dc 100644
--- a/tests/test_multiple_databases.py
+++ b/tests/test_multiple_databases.py
@@ -34,7 +34,7 @@ def test_multiple_databases():
         mapping = yaml.full_load(fd)

     logging.debug("Run the adapter (CGI)...")
-    adapter_cgi = ontoweaver.tabular.extract_all(table, mapping)
+    adapter_cgi = ontoweaver.tabular.extract_table(table, mapping)
     assert (adapter_cgi)

     logging.debug("Add CGI nodes...")
@@ -55,7 +55,7 @@ def test_multiple_databases():
         mapping = yaml.full_load(fd)

     logging.debug("Run the adapter (OncoKB)...")
-    adapter_oncokb = ontoweaver.tabular.extract_all(table, mapping)
+    adapter_oncokb = ontoweaver.tabular.extract_table(table, mapping)
     assert (adapter_oncokb)

     time.sleep(1) # Sleep for 1 second to allow the previous csv outputs to be removed. Test otherwise fails because
diff --git a/tests/test_oncokb.py b/tests/test_oncokb.py
index 1d8bd2e..94e4303 100644
--- a/tests/test_oncokb.py
+++ b/tests/test_oncokb.py
@@ -31,7 +31,7 @@ def test_oncokb():
     logging.debug("Run the adapter...")
     from tests.oncokb import types

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)

     assert (adapter)

diff --git a/tests/test_ontology_subtypes.py b/tests/test_ontology_subtypes.py
index 03815a9..189129f 100644
--- a/tests/test_ontology_subtypes.py
+++ b/tests/test_ontology_subtypes.py
@@ -29,7 +29,7 @@ def test_ontology_subtypes():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)

     assert (adapter)

diff --git a/tests/test_parallel_mapping.py b/tests/test_parallel_mapping.py
index 9c36a45..55fa18a 100644
--- a/tests/test_parallel_mapping.py
+++ b/tests/test_parallel_mapping.py
@@ -27,7 +27,7 @@ def test_parallel_mapping():
         mapping = yaml.full_load(fd)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, parallel_mapping = 8)
+    adapter = ontoweaver.tabular.extract_table(table, mapping, parallel_mapping=8)

     assert (adapter)

diff --git a/tests/test_preprocessing_ontology/test_preprocessing_ontology.py b/tests/test_preprocessing_ontology/test_preprocessing_ontology.py
index 9dd6813..6f5bbaa 100644
--- a/tests/test_preprocessing_ontology/test_preprocessing_ontology.py
+++ b/tests/test_preprocessing_ontology/test_preprocessing_ontology.py
@@ -27,7 +27,7 @@ def main():
     with open("mapping.yaml") as fd:
         mapping = yaml.full_load(fd)

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)
     assert(adapter)

     assert(adapter.nodes)
diff --git a/tests/test_properties_metadata.py b/tests/test_properties_metadata.py
index 96715f2..1849cab 100644
--- a/tests/test_properties_metadata.py
+++ b/tests/test_properties_metadata.py
@@ -27,7 +27,7 @@ def test_simplest():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="suffix")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="suffix")

     assert (adapter)

diff --git a/tests/test_replace.py b/tests/test_replace.py
index 4e4e452..68ca215 100644
--- a/tests/test_replace.py
+++ b/tests/test_replace.py
@@ -31,7 +31,7 @@ def test_replace():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="prefix", separator="___")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="prefix", separator="___")

     time.sleep(1) # Sleep for 1 second to allow the previous csv outputs to be removed. Test otherwise fails because
                   # the directory contains the BioCypher output of previous tests.
diff --git a/tests/test_simplest.py b/tests/test_simplest.py
index 9e08f16..fef1f30 100644
--- a/tests/test_simplest.py
+++ b/tests/test_simplest.py
@@ -27,7 +27,7 @@ def test_simplest():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

     assert (adapter)

diff --git a/tests/test_singular-plural.py b/tests/test_singular-plural.py
index 67cbec9..87f4186 100644
--- a/tests/test_singular-plural.py
+++ b/tests/test_singular-plural.py
@@ -63,10 +63,10 @@ P2,V2-2,S3"""

     logging.debug("Run the plural adapter...")
-    plural_adapter = ontoweaver.tabular.extract_all(table, plural_map, affix="none")
+    plural_adapter = ontoweaver.tabular.extract_table(table, plural_map, affix="none")

     logging.debug("Run the singular adapter...")
-    singular_adapter = ontoweaver.tabular.extract_all(table, singular_map, affix="none")
+    singular_adapter = ontoweaver.tabular.extract_table(table, singular_map, affix="none")

     assert(list(plural_adapter.nodes) == list(singular_adapter.nodes))
     assert(list(plural_adapter.edges) == list(singular_adapter.edges))
diff --git a/tests/test_transformer-string.py b/tests/test_transformer-string.py
index 1a39649..c47e254 100644
--- a/tests/test_transformer-string.py
+++ b/tests/test_transformer-string.py
@@ -53,7 +53,7 @@ P2,V2-2,S3"""
     map = yaml.safe_load(mapping)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, map, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, map, affix="none")

     for node in adapter.nodes:
         assert(node[2]["something"] == "Whatever it is")
diff --git a/tests/test_transformer_user.py b/tests/test_transformer_user.py
index 734c659..355bd72 100644
--- a/tests/test_transformer_user.py
+++ b/tests/test_transformer_user.py
@@ -38,7 +38,7 @@ def test_transformer_user():
     table = pd.read_csv(csv_file)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

 if __name__ == "__main__":
diff --git a/tests/test_translate.py b/tests/test_translate.py
index 1ae05ce..75af938 100644
--- a/tests/test_translate.py
+++ b/tests/test_translate.py
@@ -32,7 +32,7 @@ def test_translate():
     table = pd.read_csv(csv_file)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

     assert(adapter)
     assert(adapter.nodes)
diff --git a/tests/test_translate_file.py b/tests/test_translate_file.py
index b5f9a4d..70d1002 100644
--- a/tests/test_translate_file.py
+++ b/tests/test_translate_file.py
@@ -32,7 +32,7 @@ def test_translate_file():
     table = pd.read_csv(csv_file)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

     assert(adapter)
     assert(adapter.nodes)
@mbaric758 mbaric758 force-pushed the fix-variable-parsing branch from 423f4b0 to 86715b9 Compare December 19, 2024 13:52
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement Improvement over an existing feature
Projects
Status: No status
Development

Successfully merging this pull request may close these issues.

1 participant