211 missing pyprojecttoml #273

Open · wants to merge 10 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -18,3 +18,5 @@ tests/__pycache__
dist
build/lib
build/local

coderdata/_version.py
13 changes: 3 additions & 10 deletions coderdata/__init__.py
@@ -6,16 +6,9 @@
train_test_validate
)

# '_version.py' will be generated by hatchling once the switch away from
# setuptools.py is finished
try:
from ._version import __version__
except ImportError:
__version__ = '0.1.40'
try:
from ._version import __version_tuple__
except ImportError:
__version_tuple__ = (0, 1, 40)
from ._version import __version__
from ._version import __version_tuple__


from .utils.utils import version
from .utils.utils import list_datasets
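
For context: with the new pyproject.toml, the hatch-vcs build hook writes coderdata/_version.py at build time, which is why the ImportError fallback can go. A minimal sketch of what the generated file is expected to contain (hatch-vcs delegates to setuptools-scm; the version number is illustrative):

# coderdata/_version.py -- auto-generated at build time, git-ignored
# (see the .gitignore change above)
__version__ = version = '0.1.41'
__version_tuple__ = version_tuple = (0, 1, 41)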
2 changes: 0 additions & 2 deletions coderdata/builder/__init__.py

This file was deleted.

4 changes: 3 additions & 1 deletion coderdata/datasets.yml → coderdata/dataset.yml
@@ -1,3 +1,5 @@
figshare: "https://api.figshare.com/v2/articles/26409316"
version: "v0.1.4"
datasets:
beataml:
description: "Beat acute myeloid leukemia (BeatAML) focuses on acute myeloid leukemia tumor data. Data includes drug response, proteomics, and transcriptomics datasets."
@@ -7,4 +9,4 @@ datasets:
hcmi:
description: "Human Cancer Models Initiative (HCMI) encompasses numerous cancer types and includes cell line, organoid, and tumor data. Data includes the transcriptomics, somatic mutation, and copy number datasets."
mpnst:
description: "Malignant Peripheral Nerve Sheath Tumor is a rare, agressive sarcoma that affects peripheral nerves throughout the body."
description: "Malignant Peripheral Nerve Sheath Tumor is a rare, agressive sarcoma that affects peripheral nerves throughout the body."
49 changes: 31 additions & 18 deletions coderdata/dataset/dataset.py
@@ -11,6 +11,8 @@
import pickle
import sys
from typing import Literal
from typing import Optional
from typing import Union

import numpy as np
from numpy.random import RandomState
@@ -335,8 +337,8 @@ def train_test_validate(
'mixed-set', 'drug-blind', 'cancer-blind'
]='mixed-set',
ratio: tuple[int, int, int]=(8,1,1),
stratify_by: (str | None)=None,
random_state: (int | RandomState | None)=None,
stratify_by: Optional[str]=None,
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> Split:
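
The typing change is not cosmetic: PEP 604 unions such as str | None are evaluated when the def statement runs and raise TypeError on Python 3.9, the floor declared in the new pyproject.toml, whereas typing.Optional/Union work there. A minimal demonstration (function names hypothetical):

from typing import Optional, Union
from numpy.random import RandomState

# importable on Python 3.9+
def f(stratify_by: Optional[str] = None,
      random_state: Optional[Union[int, RandomState]] = None) -> None:
    ...

# the old spelling fails at definition time on 3.9:
# def g(stratify_by: (str | None) = None) -> None: ...   # TypeError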

@@ -386,7 +388,7 @@ def save(self, path: Path) -> None:

def load(
name: str,
directory: str|Path=Path.cwd(),
local_path: Union[str,Path]=Path.cwd(),
from_pickle:bool=False
) -> Dataset:
"""
@@ -411,50 +413,62 @@ def load(
TypeError
_description_
"""
print("Processing Data...", file=sys.stderr)

if type(directory) is not Path:
if type(local_path) is not Path:
try:
directory = Path(directory)
if not directory.exists():
local_path = Path(local_path)
if not local_path.exists():
raise OSError(
f"Given path / directory does not exist: '{directory}'"
f"Given path / directory does not exist: '{local_path}'"
)
except TypeError:
raise TypeError(
f"Invalid path / directory defined: '{directory}'"
f"Invalid path / directory defined: '{local_path}'"
)


if not from_pickle:
dataset = Dataset(name)
accepted_file_endings = ('.csv', '.tsv', '.csv.gz', '.tsv.gz')
for child in directory.iterdir():
print(f"Importing raw data ...", file=sys.stderr)
for child in local_path.iterdir():
if child.name in ["genes.csv", "genes.csv.gz"]:
print(
f"Importing 'genes' from {child} ...",
end=' ',
file=sys.stderr
)
dataset.genes = _load_file(child)
print("Loaded genes dataset.", file=sys.stderr)
print("DONE", file=sys.stderr)

if (
child.name.startswith(name)
and child.name.endswith(accepted_file_endings)
):

dataset_type = child.name[len(name)+1:].split('.')[0]
print(dataset_type)
print(
f"Importing '{dataset_type}' from {child} ...",
end=' ',
file=sys.stderr
)
if hasattr(dataset, dataset_type):
setattr(dataset, dataset_type, _load_file(child))

print("DONE", file=sys.stderr)
print(f"Importing raw data ... DONE", file=sys.stderr)
return dataset

else:
accepted_file_endings = ('.pkl', '.pickle')
for child in directory.iterdir():
for child in local_path.iterdir():
if (
child.name.startswith(name)
and child.name.endswith(accepted_file_endings)
):
print(f"Importing pickled data ...", end=' ', file=sys.stderr)
with open(child, 'rb') as file:
dataset = pickle.load(file=file)
print("DONE", file=sys.stderr)
return dataset


@@ -657,8 +671,8 @@ def train_test_validate(
'mixed-set', 'drug-blind', 'cancer-blind'
]='mixed-set',
ratio: tuple[int, int, int]=(8,1,1),
stratify_by: (str | None)=None,
random_state: (int | RandomState | None)=None,
stratify_by: Optional[str]=None,
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> Split:
"""
@@ -1003,8 +1017,8 @@ def _load_file(file_path: Path) -> pd.DataFrame:
)


def _determine_delimiter(file_path):
print(file_path.suffixes)
def _determine_delimiter(file_path: Path) -> str:
if '.tsv' in file_path.suffixes:
return '\t'
else:
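
A minimal usage sketch of the renamed local_path parameter and the split helper, assuming the beataml files have already been downloaded into ./data and that train_test_validate is called as a Dataset method as in the hunks above (paths and arguments illustrative):

import coderdata as cd

# reads beataml_*.csv/.tsv(.gz) plus genes.csv from the given directory
dataset = cd.load(name='beataml', local_path='./data')

# or restore a pickled Dataset instead (the '.pkl'/'.pickle' branch)
# dataset = cd.load(name='beataml', local_path='./data', from_pickle=True)

split = dataset.train_test_validate(
    split_type='cancer-blind',   # or 'mixed-set' / 'drug-blind'
    ratio=(8, 1, 1),             # train/test/validate proportions
    random_state=42,             # int or numpy RandomState for reproducibility
)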
7 changes: 6 additions & 1 deletion coderdata/download/downloader.py
@@ -1,11 +1,14 @@
# coderdata/download/downloader.py

from importlib import resources
from pathlib import Path
from os import PathLike
import os
import requests
import warnings

import yaml

def download(
name: str=None,
local_path: PathLike=Path.cwd(),
@@ -44,7 +47,9 @@ def download(
if not local_path.exists():
Path.mkdir(local_path)
# Get the dataset details
url = "https://api.figshare.com/v2/articles/25033697"
with resources.open_text('coderdata', 'dataset.yml') as f:
data_information = yaml.load(f, Loader=yaml.FullLoader)
url = data_information['figshare']

response = requests.get(url)
if response.status_code != 200:
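
The hard-coded figshare article URL is now read from the packaged dataset.yml. A hedged sketch of the lookup plus the subsequent API call; the shape of the 'files' array is an assumption about the figshare v2 response, not something shown in this diff:

import requests
import yaml
from importlib import resources

with resources.open_text('coderdata', 'dataset.yml') as f:
    url = yaml.load(f, Loader=yaml.FullLoader)['figshare']

article = requests.get(url).json()
# each entry is assumed to carry 'name' and 'download_url' fields
for file_info in article.get('files', []):
    print(file_info['name'], file_info['download_url'])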
49 changes: 0 additions & 49 deletions coderdata/download/figshare_latest.yml

This file was deleted.

21 changes: 14 additions & 7 deletions coderdata/utils/utils.py
@@ -5,6 +5,8 @@
from importlib import resources
import yaml

from typing import Union

from .. import __version__
from .. import __version_tuple__

@@ -19,13 +21,18 @@ def version() -> dict:
dict
Contains package and dataset build version.
"""
with resources.open_text('coderdata', 'dataset.yml') as f:
data_information = yaml.load(f, Loader=yaml.FullLoader)
return {
'package' : __version__,
'dataset' : f"{__version_tuple__[0]}.{__version_tuple__[1]}"
# getting the dataset version from 'dataset.yml'
'dataset' : data_information['version'],
        # extrapolating the dataset version from the api version number
# 'dataset' : f"{__version_tuple__[0]}.{__version_tuple__[1]}"
}
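
A minimal REPL sketch; the package number is illustrative, while the dataset number now comes straight from dataset.yml rather than being derived from the package version:

>>> import coderdata as cd
>>> cd.version()
{'package': '0.1.41', 'dataset': 'v0.1.4'}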


def list_datasets(raw: bool=False) -> dict | None:
def list_datasets(raw: bool=False) -> Union[dict, None]:
"""
    Helper function that returns a list of available datasets, including
    a short description and additional information where available.
@@ -43,11 +50,11 @@ def list_datasets(raw: bool=False) -> dict | None:
Returns a dict containing the information if ``raw==True``,
otherwise prints information to stdout and returns `None`.
"""
with resources.open_text('coderdata', 'datasets.yml') as f:
datasets = yaml.load(f, Loader=yaml.FullLoader)
with resources.open_text('coderdata', 'dataset.yml') as f:
data_information = yaml.load(f, Loader=yaml.FullLoader)
if raw:
return datasets
return data_information['datasets']
else:
datasets = datasets['datasets']
datasets = data_information['datasets']
for dataset in datasets:
print(f'{dataset}: "{datasets[dataset]['description']}"')
print(f'{dataset}: {datasets[dataset]["description"]}')
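
And a sketch of both list_datasets() modes against the renamed dataset.yml (output truncated):

>>> cd.list_datasets()            # prints to stdout, returns None
beataml: Beat acute myeloid leukemia (BeatAML) focuses on ...
hcmi: Human Cancer Models Initiative (HCMI) encompasses ...
mpnst: Malignant Peripheral Nerve Sheath Tumor is a rare, ...
>>> cd.list_datasets(raw=True)    # returns the 'datasets' sub-dict
{'beataml': {'description': '...'}, 'hcmi': {'description': '...'}, ...}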
61 changes: 61 additions & 0 deletions pyproject.toml
@@ -0,0 +1,61 @@
[build-system]
requires = [
"hatchling",
"hatch-vcs",
]
build-backend = "hatchling.build"

[project]
name = "coderdata"
description = "A package to download, load, and process multiple benchmark multi-omic drug response datasets"

requires-python = ">=3.9"
authors = [
{ name = "Jeremy Jacobson", email = "[email protected]" },
{ name = "Yannick Mahlich", email = "[email protected]" },
{ name = "Sara Gosline", email = "[email protected]"}
]
classifiers = [
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = [
"numpy",
"pandas",
"requests",
"scikit-learn",
"pyyaml",
]
dynamic = [
"version",
]
readme = "README.md"
license = {text = "2-clause BSD"}

[project.scripts]
coderdata = "coderdata.cli:main"

[project.urls]
Homepage = "https://github.com/PNNL-CompBio/candleDataProcessing"
Documentation = "https://pnnl-compbio.github.io/coderdata/"
Repository = "https://github.com/PNNL-CompBio/coderdata.git"
Issues = "https://github.com/PNNL-CompBio/coderdata/issues"

[tool.hatch.version]
source = "vcs"

[tool.hatch.build.hooks.vcs]
version-file = "coderdata/_version.py"

[tool.hatch.build.targets.sdist]
include = [
"/coderdata",
]
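
Because version is dynamic, an install from a git checkout derives its number from the latest tag via hatch-vcs instead of a hard-coded string. A minimal check (number illustrative):

>>> from importlib.metadata import version
>>> version('coderdata')      # resolved from git metadata at build time
'0.1.41'
>>> import coderdata
>>> coderdata.__version__     # re-exported from the generated _version.py
'0.1.41'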
29 changes: 0 additions & 29 deletions setup.py

This file was deleted.