211 missing pyprojecttoml #273

Open · wants to merge 10 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -18,3 +18,5 @@ tests/__pycache__
dist
build/lib
build/local

coderdata/_version.py
13 changes: 3 additions & 10 deletions coderdata/__init__.py
@@ -6,16 +6,9 @@
train_test_validate
)

# '_version.py' will be generated by hatchling once the switch away from
# setuptools.py is finished
try:
from ._version import __version__
except ImportError:
__version__ = '0.1.40'
try:
from ._version import __version_tuple__
except ImportError:
__version_tuple__ = (0, 1, 40)
from ._version import __version__
from ._version import __version_tuple__


from .utils.utils import version
from .utils.utils import list_datasets
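
For context: with the new pyproject.toml, the hatch-vcs build hook writes coderdata/_version.py at build time, which is why the ImportError fallback can go. A minimal sketch of what the generated file is expected to contain (hatch-vcs delegates to setuptools-scm; the version number is illustrative):

# coderdata/_version.py -- auto-generated at build time, git-ignored
# (see the .gitignore change above)
__version__ = version = '0.1.41'
__version_tuple__ = version_tuple = (0, 1, 41)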
2 changes: 0 additions & 2 deletions coderdata/builder/__init__.py

This file was deleted.

4 changes: 3 additions & 1 deletion coderdata/datasets.yml → coderdata/dataset.yml
@@ -1,3 +1,5 @@
figshare: "https://api.figshare.com/v2/articles/26409316"
version: "v0.1.4"
datasets:
beataml:
description: "Beat acute myeloid leukemia (BeatAML) focuses on acute myeloid leukemia tumor data. Data includes drug response, proteomics, and transcriptomics datasets."
@@ -7,4 +9,4 @@ datasets:
hcmi:
description: "Human Cancer Models Initiative (HCMI) encompasses numerous cancer types and includes cell line, organoid, and tumor data. Data includes the transcriptomics, somatic mutation, and copy number datasets."
mpnst:
description: "Malignant Peripheral Nerve Sheath Tumor is a rare, agressive sarcoma that affects peripheral nerves throughout the body."
description: "Malignant Peripheral Nerve Sheath Tumor is a rare, agressive sarcoma that affects peripheral nerves throughout the body."
49 changes: 31 additions & 18 deletions coderdata/dataset/dataset.py
@@ -11,6 +11,8 @@
import pickle
import sys
from typing import Literal
from typing import Optional
from typing import Union

import numpy as np
from numpy.random import RandomState
@@ -335,8 +337,8 @@ def train_test_validate(
'mixed-set', 'drug-blind', 'cancer-blind'
]='mixed-set',
ratio: tuple[int, int, int]=(8,1,1),
stratify_by: (str | None)=None,
random_state: (int | RandomState | None)=None,
stratify_by: Optional[str]=None,
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> Split:
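
The typing change is not cosmetic: PEP 604 unions such as str | None are evaluated when the def statement runs and raise TypeError on Python 3.9, the floor declared in the new pyproject.toml, whereas typing.Optional/Union work there. A minimal demonstration (function names hypothetical):

from typing import Optional, Union
from numpy.random import RandomState

# importable on Python 3.9+
def f(stratify_by: Optional[str] = None,
      random_state: Optional[Union[int, RandomState]] = None) -> None:
    ...

# the old spelling fails at definition time on 3.9:
# def g(stratify_by: (str | None) = None) -> None: ...   # TypeError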

@@ -386,7 +388,7 @@ def save(self, path: Path) -> None:

def load(
name: str,
directory: str|Path=Path.cwd(),
local_path: Union[str,Path]=Path.cwd(),
from_pickle:bool=False
) -> Dataset:
"""
@@ -411,50 +413,62 @@ def load(
TypeError
_description_
"""
print("Processing Data...", file=sys.stderr)

if type(directory) is not Path:
if type(local_path) is not Path:
try:
directory = Path(directory)
if not directory.exists():
local_path = Path(local_path)
if not local_path.exists():
raise OSError(
f"Given path / directory does not exist: '{directory}'"
f"Given path / directory does not exist: '{local_path}'"
)
except TypeError:
raise TypeError(
f"Invalid path / directory defined: '{directory}'"
f"Invalid path / directory defined: '{local_path}'"
)


if not from_pickle:
dataset = Dataset(name)
accepted_file_endings = ('.csv', '.tsv', '.csv.gz', '.tsv.gz')
for child in directory.iterdir():
print(f"Importing raw data ...", file=sys.stderr)
for child in local_path.iterdir():
if child.name in ["genes.csv", "genes.csv.gz"]:
print(
f"Importing 'genes' from {child} ...",
end=' ',
file=sys.stderr
)
dataset.genes = _load_file(child)
print("Loaded genes dataset.", file=sys.stderr)
print("DONE", file=sys.stderr)

if (
child.name.startswith(name)
and child.name.endswith(accepted_file_endings)
):

dataset_type = child.name[len(name)+1:].split('.')[0]
print(dataset_type)
print(
f"Importing '{dataset_type}' from {child} ...",
end=' ',
file=sys.stderr
)
if hasattr(dataset, dataset_type):
setattr(dataset, dataset_type, _load_file(child))

print("DONE", file=sys.stderr)
print(f"Importing raw data ... DONE", file=sys.stderr)
return dataset

else:
accepted_file_endings = ('.pkl', '.pickle')
for child in directory.iterdir():
for child in local_path.iterdir():
if (
child.name.startswith(name)
and child.name.endswith(accepted_file_endings)
):
print(f"Importing pickled data ...", end=' ', file=sys.stderr)
with open(child, 'rb') as file:
dataset = pickle.load(file=file)
print("DONE", file=sys.stderr)
return dataset


@@ -657,8 +671,8 @@ def train_test_validate(
'mixed-set', 'drug-blind', 'cancer-blind'
]='mixed-set',
ratio: tuple[int, int, int]=(8,1,1),
stratify_by: (str | None)=None,
random_state: (int | RandomState | None)=None,
stratify_by: Optional[str]=None,
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> Split:
"""
@@ -1003,8 +1017,8 @@ def _load_file(file_path: Path) -> pd.DataFrame:
)


def _determine_delimiter(file_path):
print(file_path.suffixes)
def _determine_delimiter(file_path: Path) -> str:
if '.tsv' in file_path.suffixes:
return '\t'
else:
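
A minimal usage sketch of the renamed local_path parameter and the split helper, assuming the beataml files have already been downloaded into ./data and that train_test_validate is called as a Dataset method as in the hunks above (paths and arguments illustrative):

import coderdata as cd

# reads beataml_*.csv/.tsv(.gz) plus genes.csv from the given directory
dataset = cd.load(name='beataml', local_path='./data')

# or restore a pickled Dataset instead (the '.pkl'/'.pickle' branch)
# dataset = cd.load(name='beataml', local_path='./data', from_pickle=True)

split = dataset.train_test_validate(
    split_type='cancer-blind',   # or 'mixed-set' / 'drug-blind'
    ratio=(8, 1, 1),             # train/test/validate proportions
    random_state=42,             # int or numpy RandomState for reproducibility
)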
7 changes: 6 additions & 1 deletion coderdata/download/downloader.py
@@ -1,11 +1,14 @@
# coderdata/download/downloader.py

from importlib import resources
from pathlib import Path
from os import PathLike
import os
import requests
import warnings

import yaml

def download(
name: str=None,
local_path: PathLike=Path.cwd(),
@@ -44,7 +47,9 @@ def download(
if not local_path.exists():
Path.mkdir(local_path)
# Get the dataset details
url = "https://api.figshare.com/v2/articles/25033697"
with resources.open_text('coderdata', 'dataset.yml') as f:
data_information = yaml.load(f, Loader=yaml.FullLoader)
url = data_information['figshare']

response = requests.get(url)
if response.status_code != 200:
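
The hard-coded figshare article URL is now read from the packaged dataset.yml. A hedged sketch of the lookup plus the subsequent API call; the shape of the 'files' array is an assumption about the figshare v2 response, not something shown in this diff:

import requests
import yaml
from importlib import resources

with resources.open_text('coderdata', 'dataset.yml') as f:
    url = yaml.load(f, Loader=yaml.FullLoader)['figshare']

article = requests.get(url).json()
# each entry is assumed to carry 'name' and 'download_url' fields
for file_info in article.get('files', []):
    print(file_info['name'], file_info['download_url'])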
49 changes: 0 additions & 49 deletions coderdata/download/figshare_latest.yml

This file was deleted.

21 changes: 14 additions & 7 deletions coderdata/utils/utils.py
@@ -5,6 +5,8 @@
from importlib import resources
import yaml

from typing import Union

from .. import __version__
from .. import __version_tuple__

@@ -19,13 +21,18 @@ def version() -> dict:
dict
Contains package and dataset build version.
"""
with resources.open_text('coderdata', 'dataset.yml') as f:
data_information = yaml.load(f, Loader=yaml.FullLoader)
return {
'package' : __version__,
'dataset' : f"{__version_tuple__[0]}.{__version_tuple__[1]}"
# getting the dataset version from 'dataset.yml'
'dataset' : data_information['version'],
        # extrapolating the dataset version from the api version number
# 'dataset' : f"{__version_tuple__[0]}.{__version_tuple__[1]}"
}
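
A minimal REPL sketch; the package number is illustrative, while the dataset number now comes straight from dataset.yml rather than being derived from the package version:

>>> import coderdata as cd
>>> cd.version()
{'package': '0.1.41', 'dataset': 'v0.1.4'}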


def list_datasets(raw: bool=False) -> dict | None:
def list_datasets(raw: bool=False) -> Union[dict, None]:
"""
    Helper function that returns a list of available datasets, including
    a short description and additional information where available.
@@ -43,11 +50,11 @@ def list_datasets(raw: bool=False) -> dict | None:
Returns a dict containing the information if ``raw==True``,
otherwise prints information to stdout and returns `None`.
"""
with resources.open_text('coderdata', 'datasets.yml') as f:
datasets = yaml.load(f, Loader=yaml.FullLoader)
with resources.open_text('coderdata', 'dataset.yml') as f:
data_information = yaml.load(f, Loader=yaml.FullLoader)
if raw:
return datasets
return data_information['datasets']
else:
datasets = datasets['datasets']
datasets = data_information['datasets']
for dataset in datasets:
print(f'{dataset}: "{datasets[dataset]['description']}"')
print(f'{dataset}: {datasets[dataset]["description"]}')
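
And a sketch of both list_datasets() modes against the renamed dataset.yml (output truncated):

>>> cd.list_datasets()            # prints to stdout, returns None
beataml: Beat acute myeloid leukemia (BeatAML) focuses on ...
hcmi: Human Cancer Models Initiative (HCMI) encompasses ...
mpnst: Malignant Peripheral Nerve Sheath Tumor is a rare, ...
>>> cd.list_datasets(raw=True)    # returns the 'datasets' sub-dict
{'beataml': {'description': '...'}, 'hcmi': {'description': '...'}, ...}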
61 changes: 61 additions & 0 deletions pyproject.toml
@@ -0,0 +1,61 @@
[build-system]
requires = [
"hatchling",
"hatch-vcs",
]
build-backend = "hatchling.build"

[project]
name = "coderdata"
description = "A package to download, load, and process multiple benchmark multi-omic drug response datasets"

requires-python = ">=3.9"
authors = [
{ name = "Jeremy Jacobson", email = "[email protected]" },
{ name = "Yannick Mahlich", email = "[email protected]" },
{ name = "Sara Gosline", email = "[email protected]"}
]
classifiers = [
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = [
"numpy",
"pandas",
"requests",
"scikit-learn",
"pyyaml",
]
dynamic = [
"version",
]
readme = "README.md"
license = {text = "2-clause BSD"}

[project.scripts]
coderdata = "coderdata.cli:main"

[project.urls]
Homepage = "https://github.com/PNNL-CompBio/candleDataProcessing"
Documentation = "https://pnnl-compbio.github.io/coderdata/"
Repository = "https://github.com/PNNL-CompBio/coderdata.git"
Issues = "https://github.com/PNNL-CompBio/coderdata/issues"

[tool.hatch.version]
source = "vcs"

[tool.hatch.build.hooks.vcs]
version-file = "coderdata/_version.py"

[tool.hatch.build.targets.sdist]
include = [
"/coderdata",
]
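
Because version is dynamic, an install from a git checkout derives its number from the latest tag via hatch-vcs instead of a hard-coded string. A minimal check (number illustrative):

>>> from importlib.metadata import version
>>> version('coderdata')      # resolved from git metadata at build time
'0.1.41'
>>> import coderdata
>>> coderdata.__version__     # re-exported from the generated _version.py
'0.1.41'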
29 changes: 0 additions & 29 deletions setup.py

This file was deleted.