Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable parsing of loaded Dataframes. Improve function naming. #82

Open
wants to merge 6 commits into
base: main
Choose a base branch
from

Conversation

mbaric758
Copy link
Collaborator

@mbaric758 mbaric758 commented Dec 19, 2024

  • Add option to pass a dictionary mapping loaded data frames to mapping files to the extraction functions.
  • Update naming of said functions to better reflect their usage and update README and docstring.
  • Update Read the Docs configuration to use Python 3.13 and the new function naming.
  • TODO: After merging, usages of extract and tabular.extract_all functions should be renamed in OncodashKB.

@mbaric758 mbaric758 requested a review from jdreo December 19, 2024 10:19
@mbaric758 mbaric758 added the enhancement Improvement over an existing feature label Dec 19, 2024
renaming of extract_all function.

diff --git a/src/ontoweaver/__init__.py b/src/ontoweaver/__init__.py
index a3cd4e8..9069eeb 100644
--- a/src/ontoweaver/__init__.py
+++ b/src/ontoweaver/__init__.py
@@ -1,8 +1,9 @@
-from typing import Tuple
+from typing import Tuple, Optional

 import biocypher
 import yaml
 import pandas as pd
+from networkx.classes import nodes

 from . import base
 Node = base.Node
@@ -23,16 +24,17 @@ from . import fusion
 __all__ = ['Node', 'Edge', 'Transformer', 'Adapter', 'All', 'tabular', 'types', 'transformer', 'serialize', 'congregate', 'merge', 'fuse', 'fusion']

-def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings, parallel_mapping = 0, separator = None, affix = "none", affix_separator = ":"):
-    """Calls several mappings, each on the related Pandas-redable tabular data file,
+def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings = None, loaded_data_mappings = None, parallel_mapping = 0, separator = None, affix = "none", affix_separator = ":"):
+    """Calls several mappings, each on the related Pandas-readable tabular data file,
        then reconciliate duplicated nodes and edges (on nodes' IDs, merging properties in lists),
        then export everything with BioCypher.
        Returns the path to the resulting import file.

        Args:
-           biocypher_config_path: the BioCypher configuration file
-           schema_path: the assembling schema file
-           data_mappings: a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them
+           biocypher_config_path: the BioCypher configuration file.
+           schema_path: the assembling schema file.
+           data_mappings: a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them.
+           loaded_data_mappings: a dictionary mapping a loaded Pandas data frame to the mapping yaml file.
            parallel_mapping (int): Number of workers to use in parallel mapping. Defaults to 0 for sequential processing.
            separator (str, optional): The separator to use for combining values in reconciliation. Defaults to None.
            affix (str, optional): The affix to use for type inclusion. Defaults to "none".
@@ -41,22 +43,38 @@ def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings
        Returns:
            The path to the import file.
    """
-
-    assert(type(data_mappings) == dict) # data_file => mapping_file
-
     nodes = []
     edges = []

-    for data_file, mapping_file in data_mappings.items():
-        table = pd.read_csv(data_file)
+    if data_mappings:

-        with open(mapping_file) as fd:
-            mapping = yaml.full_load(fd)
+        assert(type(data_mappings) == dict) # data_file => mapping_file

-        adapter = tabular.extract_all(table, mapping, parallel_mapping = parallel_mapping, affix = affix, separator = affix_separator)
+        for data_file, mapping_file in data_mappings.items():
+            table = pd.read_csv(data_file)

-        nodes += adapter.nodes
-        edges += adapter.edges
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(table, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges
+
+    if loaded_data_mappings:
+
+        assert(type(loaded_data_mappings) == dict) # data_frame => mapping_file
+
+        for data_frame, mapping_file in loaded_data_mappings.items():
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(data_frame, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges

     fnodes, fedges = fusion.reconciliate(nodes, edges, separator = separator)

@@ -74,14 +92,14 @@ def extract_reconciliate_write(biocypher_config_path, schema_path, data_mappings
     return import_file

-def extract(data_mappings: dict, parallel_mapping = 0, affix="none", separator=":") -> Tuple[list[Tuple], list[Tuple]]:
+def extract(data_mappings = None, loaded_data_mappings = None, parallel_mapping = 0, affix="none", affix_separator=":") -> Tuple[list[Tuple], list[Tuple]]:
     """
     Extracts nodes and edges from tabular data files based on provided mappings.

     Args:
-        data_mappings (dict): a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them
+        data_mappings (dict): a dictionary mapping data file path to the OntoWeaver mapping yaml file to extract them.
+        loaded_data_mappings (dict): a dictionary mapping a loaded Pandas data frame to the mapping yaml file.
         parallel_mapping (int): Number of workers to use in parallel mapping. Defaults to 0 for sequential processing.
-        separator (str, optional): The separator to use for splitting ID and type. Defaults to None.
         affix (str, optional): The affix to use for type inclusion. Defaults to "none".
         affix_separator: The character(s) separating the label from its type affix. Defaults to ":".

@@ -89,21 +107,38 @@ def extract(data_mappings: dict, parallel_mapping = 0, affix="none", separator="
         tuple: Two lists of tuples containing nodes and edges.
     """

-    assert(type(data_mappings) == dict) # data_file => mapping_file
-
     nodes = []
     edges = []

-    for data_file, mapping_file in data_mappings.items():
-        table = pd.read_csv(data_file, sep = None)
+    if data_mappings:

-        with open(mapping_file) as fd:
-            mapping = yaml.full_load(fd)
+        assert(type(data_mappings) == dict) # data_file => mapping_file

-        adapter = tabular.extract_all(table, mapping, parallel_mapping=parallel_mapping, affix=affix, separator=separator)
+        for data_file, mapping_file in data_mappings.items():
+            table = pd.read_csv(data_file)

-        nodes += adapter.nodes
-        edges += adapter.edges
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(table, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges
+
+    if loaded_data_mappings:
+
+        assert(type(loaded_data_mappings) == dict) # data_frame => mapping_file
+
+        for data_frame, mapping_file in loaded_data_mappings.items():
+            with open(mapping_file) as fd:
+                mapping = yaml.full_load(fd)
+
+            adapter = tabular.extract_table(data_frame, mapping, parallel_mapping=parallel_mapping, affix=affix,
+                                            separator=affix_separator)
+
+            nodes += adapter.nodes
+            edges += adapter.edges

     return nodes, edges

diff --git a/src/ontoweaver/tabular.py b/src/ontoweaver/tabular.py
index c1d852e..ea1aca6 100644
--- a/src/ontoweaver/tabular.py
+++ b/src/ontoweaver/tabular.py
@@ -394,7 +394,7 @@ class PandasAdapter(base.Adapter):
                 f"Performed {nb_transformations} transformations with {len(self.transformers)} transformers, producing {nb_nodes} nodes for {nb_rows} rows.")

-def extract_all(df: pd.DataFrame, config: dict, parallel_mapping = 0, module = types, affix = "suffix", separator = ":"):
+def extract_table(df: pd.DataFrame, config: dict, parallel_mapping = 0, module = types, affix = "suffix", separator = ":"):
     """
     Proxy function for extracting from a table all nodes, edges and properties
     that are defined in a PandasAdapter configuration.
diff --git a/tests/test_2ontologies.py b/tests/test_2ontologies.py
index c329041..a053a5e 100644
--- a/tests/test_2ontologies.py
+++ b/tests/test_2ontologies.py
@@ -18,7 +18,7 @@ def main():
     with open("oim.yaml") as fd:
         mapping = yaml.full_load(fd)

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)
     assert(adapter)

     assert(adapter.nodes)
diff --git a/tests/test_affix_separator.py b/tests/test_affix_separator.py
index 6e1b16d..55dc69f 100644
--- a/tests/test_affix_separator.py
+++ b/tests/test_affix_separator.py
@@ -29,7 +29,7 @@ def test_affix_separator():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="prefix", separator="___")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="prefix", separator="___")

     assert (adapter)

diff --git a/tests/test_edges_between_columns.py b/tests/test_edges_between_columns.py
index b0310c0..c33e09a 100644
--- a/tests/test_edges_between_columns.py
+++ b/tests/test_edges_between_columns.py
@@ -31,7 +31,7 @@ def test_edges_between_columns():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)

     assert (adapter)

diff --git a/tests/test_multiple_databases.py b/tests/test_multiple_databases.py
index da6bba0..c69c9dc 100644
--- a/tests/test_multiple_databases.py
+++ b/tests/test_multiple_databases.py
@@ -34,7 +34,7 @@ def test_multiple_databases():
         mapping = yaml.full_load(fd)

     logging.debug("Run the adapter (CGI)...")
-    adapter_cgi = ontoweaver.tabular.extract_all(table, mapping)
+    adapter_cgi = ontoweaver.tabular.extract_table(table, mapping)
     assert (adapter_cgi)

     logging.debug("Add CGI nodes...")
@@ -55,7 +55,7 @@ def test_multiple_databases():
         mapping = yaml.full_load(fd)

     logging.debug("Run the adapter (OncoKB)...")
-    adapter_oncokb = ontoweaver.tabular.extract_all(table, mapping)
+    adapter_oncokb = ontoweaver.tabular.extract_table(table, mapping)
     assert (adapter_oncokb)

     time.sleep(1) # Sleep for 1 second to allow the previous csv outputs to be removed. Test otherwise fails because
diff --git a/tests/test_oncokb.py b/tests/test_oncokb.py
index 1d8bd2e..94e4303 100644
--- a/tests/test_oncokb.py
+++ b/tests/test_oncokb.py
@@ -31,7 +31,7 @@ def test_oncokb():
     logging.debug("Run the adapter...")
     from tests.oncokb import types

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)

     assert (adapter)

diff --git a/tests/test_ontology_subtypes.py b/tests/test_ontology_subtypes.py
index 03815a9..189129f 100644
--- a/tests/test_ontology_subtypes.py
+++ b/tests/test_ontology_subtypes.py
@@ -29,7 +29,7 @@ def test_ontology_subtypes():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)

     assert (adapter)

diff --git a/tests/test_parallel_mapping.py b/tests/test_parallel_mapping.py
index 9c36a45..55fa18a 100644
--- a/tests/test_parallel_mapping.py
+++ b/tests/test_parallel_mapping.py
@@ -27,7 +27,7 @@ def test_parallel_mapping():
         mapping = yaml.full_load(fd)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, parallel_mapping = 8)
+    adapter = ontoweaver.tabular.extract_table(table, mapping, parallel_mapping=8)

     assert (adapter)

diff --git a/tests/test_preprocessing_ontology/test_preprocessing_ontology.py b/tests/test_preprocessing_ontology/test_preprocessing_ontology.py
index 9dd6813..6f5bbaa 100644
--- a/tests/test_preprocessing_ontology/test_preprocessing_ontology.py
+++ b/tests/test_preprocessing_ontology/test_preprocessing_ontology.py
@@ -27,7 +27,7 @@ def main():
     with open("mapping.yaml") as fd:
         mapping = yaml.full_load(fd)

-    adapter = ontoweaver.tabular.extract_all(table, mapping)
+    adapter = ontoweaver.tabular.extract_table(table, mapping)
     assert(adapter)

     assert(adapter.nodes)
diff --git a/tests/test_properties_metadata.py b/tests/test_properties_metadata.py
index 96715f2..1849cab 100644
--- a/tests/test_properties_metadata.py
+++ b/tests/test_properties_metadata.py
@@ -27,7 +27,7 @@ def test_simplest():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="suffix")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="suffix")

     assert (adapter)

diff --git a/tests/test_replace.py b/tests/test_replace.py
index 4e4e452..68ca215 100644
--- a/tests/test_replace.py
+++ b/tests/test_replace.py
@@ -31,7 +31,7 @@ def test_replace():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="prefix", separator="___")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="prefix", separator="___")

     time.sleep(1) # Sleep for 1 second to allow the previous csv outputs to be removed. Test otherwise fails because
                   # the directory contains the BioCypher output of previous tests.
diff --git a/tests/test_simplest.py b/tests/test_simplest.py
index 9e08f16..fef1f30 100644
--- a/tests/test_simplest.py
+++ b/tests/test_simplest.py
@@ -27,7 +27,7 @@ def test_simplest():

     logging.debug("Run the adapter...")

-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

     assert (adapter)

diff --git a/tests/test_singular-plural.py b/tests/test_singular-plural.py
index 67cbec9..87f4186 100644
--- a/tests/test_singular-plural.py
+++ b/tests/test_singular-plural.py
@@ -63,10 +63,10 @@ P2,V2-2,S3"""

     logging.debug("Run the plural adapter...")
-    plural_adapter = ontoweaver.tabular.extract_all(table, plural_map, affix="none")
+    plural_adapter = ontoweaver.tabular.extract_table(table, plural_map, affix="none")

     logging.debug("Run the singular adapter...")
-    singular_adapter = ontoweaver.tabular.extract_all(table, singular_map, affix="none")
+    singular_adapter = ontoweaver.tabular.extract_table(table, singular_map, affix="none")

     assert(list(plural_adapter.nodes) == list(singular_adapter.nodes))
     assert(list(plural_adapter.edges) == list(singular_adapter.edges))
diff --git a/tests/test_transformer-string.py b/tests/test_transformer-string.py
index 1a39649..c47e254 100644
--- a/tests/test_transformer-string.py
+++ b/tests/test_transformer-string.py
@@ -53,7 +53,7 @@ P2,V2-2,S3"""
     map = yaml.safe_load(mapping)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, map, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, map, affix="none")

     for node in adapter.nodes:
         assert(node[2]["something"] == "Whatever it is")
diff --git a/tests/test_transformer_user.py b/tests/test_transformer_user.py
index 734c659..355bd72 100644
--- a/tests/test_transformer_user.py
+++ b/tests/test_transformer_user.py
@@ -38,7 +38,7 @@ def test_transformer_user():
     table = pd.read_csv(csv_file)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

 if __name__ == "__main__":
diff --git a/tests/test_translate.py b/tests/test_translate.py
index 1ae05ce..75af938 100644
--- a/tests/test_translate.py
+++ b/tests/test_translate.py
@@ -32,7 +32,7 @@ def test_translate():
     table = pd.read_csv(csv_file)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

     assert(adapter)
     assert(adapter.nodes)
diff --git a/tests/test_translate_file.py b/tests/test_translate_file.py
index b5f9a4d..70d1002 100644
--- a/tests/test_translate_file.py
+++ b/tests/test_translate_file.py
@@ -32,7 +32,7 @@ def test_translate_file():
     table = pd.read_csv(csv_file)

     logging.debug("Run the adapter...")
-    adapter = ontoweaver.tabular.extract_all(table, mapping, affix="none")
+    adapter = ontoweaver.tabular.extract_table(table, mapping, affix="none")

     assert(adapter)
     assert(adapter.nodes)
@mbaric758 mbaric758 force-pushed the fix-variable-parsing branch from 423f4b0 to 86715b9 Compare December 19, 2024 13:52
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement Improvement over an existing feature
Projects
Status: No status
Development

Successfully merging this pull request may close these issues.

1 participant