-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enable parsing of loaded Dataframes. Improve function naming. #82
Open
mbaric758
wants to merge
6
commits into
oncodash:main
Choose a base branch
from
mbaric758:fix-variable-parsing
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
renaming of extract_all function. diff --git a/src/ontoweaver/__init__.py b/src/ontoweaver/__init__.py index a3cd4e8..9069eeb 100644 --- a/src/ontoweaver/__init__.py +++ b/src/ontoweaver/__init__.py @@ -1,8 +1,9 @@ -from typing import Tuple +from typing import Tuple, Optional import biocypher import yaml import pandas as pd +from networkx.classes import nodes from . import base Node = base.Node @@ -23,16 +24,17 @@ from . import fusion __all__ = ['Node', 'Edge', 'Transformer', 'Adapter', 'All', 'tabular', 'types', 'transformer', 'serialize', 'congregate', 'merge', 'fuse', 'fusion'] -def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings, parallel_mapping = 0, separator = None, affix = "none", affix_separator = ":"): - """Calls several mappings, each on the related Pandas-redable tabular data file, +def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings = None, loaded_data_mappings = None, parallel_mapping = 0, separator = None, affix = "none", affix_separator = ":"): + """Calls several mappings, each on the related Pandas-readable tabular data file, then reconciliate duplicated nodes and edges (on nodes' IDs, merging properties in lists), then export everything with BioCypher. Returns the path to the resulting import file. Args: - biocypher_config_path: the BioCypher configuration file - schema_path: the assembling schema file - data_mappings: a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them + biocypher_config_path: the BioCypher configuration file. + schema_path: the assembling schema file. + data_mappings: a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them. + loaded_data_mappings: a dictionary mapping loaded Pandas data frame to the mapping yaml file. parallel_mapping (int): Number of workers to use in parallel mapping. Defaults to 0 for sequential processing. 
separator (str, optional): The separator to use for combining values in reconciliation. Defaults to None. affix (str, optional): The affix to use for type inclusion. Defaults to "none". @@ -41,22 +43,38 @@ def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings Returns: The path to the import file. """ - - assert(type(data_mappings) == dict) # data_file => mapping_file - nodes = [] edges = [] - for data_file, mapping_file in data_mappings.items(): - table = pd.read_csv(data_file) + if data_mappings: - with open(mapping_file) as fd: - mapping = yaml.full_load(fd) + assert(type(data_mappings) == dict) # data_file => mapping_file - adapter = tabular.extract_all(table, mapping, parallel_mapping = parallel_mapping, affix = affix, separator = affix_separator) + for data_file, mapping_file in data_mappings.items(): + table = pd.read_csv(data_file) - nodes += adapter.nodes - edges += adapter.edges + with open(mapping_file) as fd: + mapping = yaml.full_load(fd) + + adapter = tabular.extract_table(table, mapping, parallel_mapping=parallel_mapping, affix=affix, + separator=affix_separator) + + nodes += adapter.nodes + edges += adapter.edges + + if loaded_data_mappings: + + assert(type(loaded_data_mappings) == dict) # data_frame => mapping_file + + for data_frame, mapping_file in loaded_data_mappings.items(): + with open(mapping_file) as fd: + mapping = yaml.full_load(fd) + + adapter = tabular.extract_table(data_frame, mapping, parallel_mapping=parallel_mapping, affix=affix, + separator=affix_separator) + + nodes += adapter.nodes + edges += adapter.edges fnodes, fedges = fusion.reconciliate(nodes, edges, separator = separator) @@ -74,14 +92,14 @@ def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings return import_file -def extract(data_mappings: dict, parallel_mapping = 0, affix="none", separator=":") -> Tuple[list[Tuple], list[Tuple]]: +def extract(data_mappings = None, loaded_data_mappings = None, parallel_mapping = 0, 
affix="none", affix_separator=":") -> Tuple[list[Tuple], list[Tuple]]: """ Extracts nodes and edges from tabular data files based on provided mappings. Args: - data_mappings (dict): a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them + data_mappings (dict): a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them. + loaded_data_mappings (dict): a dictionary mapping loaded Pandas data frame to the mapping yaml file. parallel_mapping (int): Number of workers to use in parallel mapping. Defaults to 0 for sequential processing. - separator (str, optional): The separator to use for splitting ID and type. Defaults to None. affix (str, optional): The affix to use for type inclusion. Defaults to "none". affix_separator: The character(s) separating the label from its type affix. Defaults to ":". @@ -89,21 +107,38 @@ def extract(data_mappings: dict, parallel_mapping = 0, affix="none", separator=" tuple: Two lists of tuples containing nodes and edges. 
""" - assert(type(data_mappings) == dict) # data_file => mapping_file - nodes = [] edges = [] - for data_file, mapping_file in data_mappings.items(): - table = pd.read_csv(data_file, sep = None) + if data_mappings: - with open(mapping_file) as fd: - mapping = yaml.full_load(fd) + assert(type(data_mappings) == dict) # data_file => mapping_file - adapter = tabular.extract_all(table, mapping, parallel_mapping=parallel_mapping, affix=affix, separator=separator) + for data_file, mapping_file in data_mappings.items(): + table = pd.read_csv(data_file) - nodes += adapter.nodes - edges += adapter.edges + with open(mapping_file) as fd: + mapping = yaml.full_load(fd) + + adapter = tabular.extract_table(table, mapping, parallel_mapping=parallel_mapping, affix=affix, + separator=affix_separator) + + nodes += adapter.nodes + edges += adapter.edges + + if loaded_data_mappings: + + assert(type(loaded_data_mappings) == dict) # data_frame => mapping_file + + for data_frame, mapping_file in loaded_data_mappings.items(): + with open(mapping_file) as fd: + mapping = yaml.full_load(fd) + + adapter = tabular.extract_table(data_frame, mapping, parallel_mapping=parallel_mapping, affix=affix, + separator=affix_separator) + + nodes += adapter.nodes + edges += adapter.edges return nodes, edges diff --git a/src/ontoweaver/tabular.py b/src/ontoweaver/tabular.py index c1d852e..ea1aca6 100644 --- a/src/ontoweaver/tabular.py +++ b/src/ontoweaver/tabular.py @@ -394,7 +394,7 @@ class PandasAdapter(base.Adapter): f"Performed {nb_transformations} transformations with {len(self.transformers)} transformers, producing {nb_nodes} nodes for {nb_rows} rows.") -def extract_all(df: pd.DataFrame, config: dict, parallel_mapping = 0, module = types, affix = "suffix", separator = ":"): +def extract_table(df: pd.DataFrame, config: dict, parallel_mapping = 0, module = types, affix = "suffix", separator = ":"): """ Proxy function for extracting from a table all nodes, edges and properties that are defined in a 
PandasAdapter configuration. diff --git a/tests/test_2ontologies.py b/tests/test_2ontologies.py index c329041..a053a5e 100644 --- a/tests/test_2ontologies.py +++ b/tests/test_2ontologies.py @@ -18,7 +18,7 @@ def main(): with open("oim.yaml") as fd: mapping = yaml.full_load(fd) - adapter = ontoweaver.tabular.extract_all(table, mapping) + adapter = ontoweaver.tabular.extract_table(table, mapping) assert(adapter) assert(adapter.nodes) diff --git a/tests/test_affix_separator.py b/tests/test_affix_separator.py index 6e1b16d..55dc69f 100644 --- a/tests/test_affix_separator.py +++ b/tests/test_affix_separator.py @@ -29,7 +29,7 @@ def test_affix_separator(): logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping, affix="prefix", separator="___") + adapter = ontoweaver.tabular.extract_table(table, mapping, affix="prefix", separator="___") assert (adapter) diff --git a/tests/test_edges_between_columns.py b/tests/test_edges_between_columns.py index b0310c0..c33e09a 100644 --- a/tests/test_edges_between_columns.py +++ b/tests/test_edges_between_columns.py @@ -31,7 +31,7 @@ def test_edges_between_columns(): logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping) + adapter = ontoweaver.tabular.extract_table(table, mapping) assert (adapter) diff --git a/tests/test_multiple_databases.py b/tests/test_multiple_databases.py index da6bba0..c69c9dc 100644 --- a/tests/test_multiple_databases.py +++ b/tests/test_multiple_databases.py @@ -34,7 +34,7 @@ def test_multiple_databases(): mapping = yaml.full_load(fd) logging.debug("Run the adapter (CGI)...") - adapter_cgi = ontoweaver.tabular.extract_all(table, mapping) + adapter_cgi = ontoweaver.tabular.extract_table(table, mapping) assert (adapter_cgi) logging.debug("Add CGI nodes...") @@ -55,7 +55,7 @@ def test_multiple_databases(): mapping = yaml.full_load(fd) logging.debug("Run the adapter (OncoKB)...") - adapter_oncokb = ontoweaver.tabular.extract_all(table, 
mapping) + adapter_oncokb = ontoweaver.tabular.extract_table(table, mapping) assert (adapter_oncokb) time.sleep(1) # Sleep for 1 second to allow the previous csv outputs to be removed. Test otherwise fails because diff --git a/tests/test_oncokb.py b/tests/test_oncokb.py index 1d8bd2e..94e4303 100644 --- a/tests/test_oncokb.py +++ b/tests/test_oncokb.py @@ -31,7 +31,7 @@ def test_oncokb(): logging.debug("Run the adapter...") from tests.oncokb import types - adapter = ontoweaver.tabular.extract_all(table, mapping) + adapter = ontoweaver.tabular.extract_table(table, mapping) assert (adapter) diff --git a/tests/test_ontology_subtypes.py b/tests/test_ontology_subtypes.py index 03815a9..189129f 100644 --- a/tests/test_ontology_subtypes.py +++ b/tests/test_ontology_subtypes.py @@ -29,7 +29,7 @@ def test_ontology_subtypes(): logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping) + adapter = ontoweaver.tabular.extract_table(table, mapping) assert (adapter) diff --git a/tests/test_parallel_mapping.py b/tests/test_parallel_mapping.py index 9c36a45..55fa18a 100644 --- a/tests/test_parallel_mapping.py +++ b/tests/test_parallel_mapping.py @@ -27,7 +27,7 @@ def test_parallel_mapping(): mapping = yaml.full_load(fd) logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping, parallel_mapping = 8) + adapter = ontoweaver.tabular.extract_table(table, mapping, parallel_mapping=8) assert (adapter) diff --git a/tests/test_preprocessing_ontology/test_preprocessing_ontology.py b/tests/test_preprocessing_ontology/test_preprocessing_ontology.py index 9dd6813..6f5bbaa 100644 --- a/tests/test_preprocessing_ontology/test_preprocessing_ontology.py +++ b/tests/test_preprocessing_ontology/test_preprocessing_ontology.py @@ -27,7 +27,7 @@ def main(): with open("mapping.yaml") as fd: mapping = yaml.full_load(fd) - adapter = ontoweaver.tabular.extract_all(table, mapping) + adapter = ontoweaver.tabular.extract_table(table, 
mapping) assert(adapter) assert(adapter.nodes) diff --git a/tests/test_properties_metadata.py b/tests/test_properties_metadata.py index 96715f2..1849cab 100644 --- a/tests/test_properties_metadata.py +++ b/tests/test_properties_metadata.py @@ -27,7 +27,7 @@ def test_simplest(): logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping, affix="suffix") + adapter = ontoweaver.tabular.extract_table(table, mapping, affix="suffix") assert (adapter) diff --git a/tests/test_replace.py b/tests/test_replace.py index 4e4e452..68ca215 100644 --- a/tests/test_replace.py +++ b/tests/test_replace.py @@ -31,7 +31,7 @@ def test_replace(): logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping, affix="prefix", separator="___") + adapter = ontoweaver.tabular.extract_table(table, mapping, affix="prefix", separator="___") time.sleep(1) # Sleep for 1 second to allow the previous csv outputs to be removed. Test otherwise fails because # the directory contains the BioCypher output of previous tests. 
diff --git a/tests/test_simplest.py b/tests/test_simplest.py index 9e08f16..fef1f30 100644 --- a/tests/test_simplest.py +++ b/tests/test_simplest.py @@ -27,7 +27,7 @@ def test_simplest(): logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none") + adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none") assert (adapter) diff --git a/tests/test_singular-plural.py b/tests/test_singular-plural.py index 67cbec9..87f4186 100644 --- a/tests/test_singular-plural.py +++ b/tests/test_singular-plural.py @@ -63,10 +63,10 @@ P2,V2-2,S3""" logging.debug("Run the plural adapter...") - plural_adapter = ontoweaver.tabular.extract_all(table, plural_map, affix="none") + plural_adapter = ontoweaver.tabular.extract_table(table, plural_map, affix="none") logging.debug("Run the singular adapter...") - singular_adapter = ontoweaver.tabular.extract_all(table, singular_map, affix="none") + singular_adapter = ontoweaver.tabular.extract_table(table, singular_map, affix="none") assert(list(plural_adapter.nodes) == list(singular_adapter.nodes)) assert(list(plural_adapter.edges) == list(singular_adapter.edges)) diff --git a/tests/test_transformer-string.py b/tests/test_transformer-string.py index 1a39649..c47e254 100644 --- a/tests/test_transformer-string.py +++ b/tests/test_transformer-string.py @@ -53,7 +53,7 @@ P2,V2-2,S3""" map = yaml.safe_load(mapping) logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, map, affix="none") + adapter = ontoweaver.tabular.extract_table(table, map, affix="none") for node in adapter.nodes: assert(node[2]["something"] == "Whatever it is") diff --git a/tests/test_transformer_user.py b/tests/test_transformer_user.py index 734c659..355bd72 100644 --- a/tests/test_transformer_user.py +++ b/tests/test_transformer_user.py @@ -38,7 +38,7 @@ def test_transformer_user(): table = pd.read_csv(csv_file) logging.debug("Run the adapter...") - adapter = 
ontoweaver.tabular.extract_all(table, mapping, affix="none") + adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none") if __name__ == "__main__": diff --git a/tests/test_translate.py b/tests/test_translate.py index 1ae05ce..75af938 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -32,7 +32,7 @@ def test_translate(): table = pd.read_csv(csv_file) logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none") + adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none") assert(adapter) assert(adapter.nodes) diff --git a/tests/test_translate_file.py b/tests/test_translate_file.py index b5f9a4d..70d1002 100644 --- a/tests/test_translate_file.py +++ b/tests/test_translate_file.py @@ -32,7 +32,7 @@ def test_translate_file(): table = pd.read_csv(csv_file) logging.debug("Run the adapter...") - adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none") + adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none") assert(adapter) assert(adapter.nodes)
mbaric758
force-pushed
the
fix-variable-parsing
branch
from
December 19, 2024 13:52
423f4b0
to
86715b9
Compare
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Added a "loaded data frame : mapping file" dictionary to the extraction functions. The `extract` and `tabular.extract_all` functions should be renamed in `OncodashKB`.