updates from main (#177)

open2c · Dec 22, 2023 · db7c715 · db7c715
1 parent cd05f40
commit db7c715
Show file tree

Hide file tree

Showing 12 changed files with 389 additions and 196 deletions.
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -7,9 +7,12 @@ on:
 jobs:
   Publish:
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Setup Python
         uses: actions/setup-python@v4
@@ -26,7 +29,3 @@ jobs:
 
       - name: Publish distribution 📦 to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          user: ${{ secrets.PYPI_USERNAME }}
-          password: ${{ secrets.PYPI_PASSWORD }}
-
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -4,7 +4,10 @@
 
 # Required
 version: 2
-
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/conf.py

diff --git a/CHANGES.md b/CHANGES.md
@@ -1,8 +1,17 @@
 # Release notes
 
+## [Upcoming release](https://github.com/open2c/bioframe/compare/v0.5.1...HEAD)
+
+## [v0.5.1](https://github.com/open2c/bioframe/compare/v0.5.0...v0.5.1)
+Date 2023-11-08
+
+Bug fixes:
+* `pandas.Series` inputs are now treated like dicts in `make_chromarms`
+
+
 ## [v0.5.0](https://github.com/open2c/bioframe/compare/v0.4.1...v0.5.0)
 
-Date 2023-05-10
+Date 2023-10-05
 
 API changes:
 * New builtin curated genome assembly database (metadata, chromsizes, cytobands):

diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,78 @@
+cff-version: 1.2.0
+type: software
+title: bioframe
+license: MIT
+repository-code: 'https://github.com/open2c/bioframe'
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+authors:
+  - given-names: Nezar
+    family-names: Abdennur
+    orcid: 'https://orcid.org/0000-0001-5814-0864'
+  - given-names: Geoffrey
+    family-names: Fudenberg
+    orcid: "https://orcid.org/0000-0001-5905-6517"
+  - given-names: Ilya
+    family-names: Flyamer
+    orcid: "https://orcid.org/0000-0002-4892-4208"
+  - given-names: Aleksandra
+    family-names: Galitsyna
+    orcid: "https://orcid.org/0000-0001-8969-5694"
+  - given-names: Anton
+    family-names: Goloborodko
+    orcid: "https://orcid.org/0000-0002-2210-8616"
+  - given-names: Maxim
+    family-names: Imakaev
+    orcid: "https://orcid.org/0000-0002-5320-2728"
+  - given-names: Sergey
+    family-names: Venev
+    orcid: "https://orcid.org/0000-0002-1507-7460"
+abstract: >-
+  Bioframe is a library to enable flexible and performant
+  operations on genomic interval data frames in Python.
+keywords:
+  - bioinformatics
+  - genomics
+  - ranges
+  - intervals
+  - dataframes
+  - pandas
+  - numpy
+  - Python
+identifiers:
+  - type: doi
+    value: 10.5281/zenodo.3897573
+    description: Zenodo
+  - type: doi
+    value: 10.1101/2022.02.16.480748
+    description: bioRxiv
+preferred-citation:
+  type: article
+  title: "Bioframe: Operations on Genomic Intervals in Pandas Dataframes"
+  authors:
+    - given-names: Open2C
+    - given-names: Nezar
+      family-names: Abdennur
+      orcid: 'https://orcid.org/0000-0001-5814-0864'
+    - given-names: Geoffrey
+      family-names: Fudenberg
+      orcid: "https://orcid.org/0000-0001-5905-6517"
+    - given-names: Ilya
+      family-names: Flyamer
+      orcid: "https://orcid.org/0000-0002-4892-4208"
+    - given-names: Aleksandra
+      family-names: Galitsyna
+      orcid: "https://orcid.org/0000-0001-8969-5694"
+    - given-names: Anton
+      family-names: Goloborodko
+      orcid: "https://orcid.org/0000-0002-2210-8616"
+    - given-names: Maxim
+      family-names: Imakaev
+      orcid: "https://orcid.org/0000-0002-5320-2728"
+    - given-names: Sergey
+      family-names: Venev
+      orcid: "https://orcid.org/0000-0002-1507-7460"
+  journal: bioRxiv
+  year: 2022
+  doi: "10.1101/2022.02.16.480748"
diff --git a/README.md b/README.md
@@ -19,10 +19,6 @@ bioRxiv 2022.02.16.480748; doi: https://doi.org/10.1101/2022.02.16.480748
 
 
 ## Installation
-The following are required before installing bioframe:
-* Python 3.7+
-* `numpy`
-* `pandas>=1.3`
 
 ```sh
 pip install bioframe
@@ -47,13 +43,13 @@ bf.overlap(df1, df2)
 
 For these two input dataframes, with intervals all on the same chromosome:
 
-<img src="./docs/figs/df1.png" width=60%> 
-<img src="./docs/figs/df2.png" width=60%> 
+<img src="https://github.com/open2c/bioframe/raw/main/docs/figs/df1.png" width=60%> 
+<img src="https://github.com/open2c/bioframe/raw/main/docs/figs/df2.png" width=60%> 
 
 `overlap` will return the following interval pairs as overlaps:
 
-<img src="./docs/figs/overlap_inner_0.png" width=60%> 
-<img src="./docs/figs/overlap_inner_1.png" width=60%> 
+<img src="https://github.com/open2c/bioframe/raw/main/docs/figs/overlap_inner_0.png" width=60%> 
+<img src="https://github.com/open2c/bioframe/raw/main/docs/figs/overlap_inner_1.png" width=60%> 
 
 
 To `merge` all overlapping intervals in a dataframe, call:
@@ -65,11 +61,11 @@ bf.merge(df1)
 
 For this input dataframe, with intervals all on the same chromosome:
 
-<img src="./docs/figs/df1.png" width=60%> 
+<img src="https://github.com/open2c/bioframe/raw/main/docs/figs/df1.png" width=60%> 
 
 `merge` will return a new dataframe with these merged intervals:
 
-<img src="./docs/figs/merge_df1.png" width=60%> 
+<img src="https://github.com/open2c/bioframe/raw/main/docs/figs/merge_df1.png" width=60%> 
 
 See the [guide](https://bioframe.readthedocs.io/en/latest/guide-intervalops.html) for visualizations of other interval operations in bioframe.
 
@@ -78,14 +74,9 @@ See the [guide](https://bioframe.readthedocs.io/en/latest/guide-intervalops.html
 Bioframe includes utilities for reading genomic file formats into dataframes and vice versa. One handy function is `read_table` which mirrors pandas’s read_csv/read_table but provides a [`schema`](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py) argument to populate column names for common tabular file formats.
 
 ```python
-jaspar_url = 'http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2018/hg38/tsv/MA0139.1.tsv.gz'
+jaspar_url = 'http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2022/hg38/MA0139.1.tsv.gz'
 ctcf_motif_calls = bioframe.read_table(jaspar_url, schema='jaspar', skiprows=1)
 ```
 
 ## Tutorials
 See this [Jupyter notebook](https://github.com/open2c/bioframe/blob/main/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb) for an example of how to assign TF motifs to ChIP-seq peaks using bioframe.
-
-## Projects currently using bioframe:
-* [cooler](https://github.com/open2c/cooler)
-* [cooltools](https://github.com/open2c/cooltools)
-* yours? :)
diff --git a/bioframe/_version.py b/bioframe/_version.py
@@ -1 +1 @@
-__version__ = "0.5.0"
+__version__ = "0.5.1"
diff --git a/bioframe/extras.py b/bioframe/extras.py
@@ -28,15 +28,15 @@ def make_chromarms(
 
     Parameters
     ----------
-    chromsizes : pandas.Dataframe or pandas.Series
-        If pandas.Series, a map from chromosomes to lengths in bp.
+    chromsizes : pandas.DataFrame or dict-like
+        If dict or pandas.Series, a map from chromosomes to lengths in bp.
         If pandas.DataFrame, a dataframe with columns defined by cols_chroms.
         If cols_chroms is a triplet (e.g. 'chrom','start','end'), then
         values in chromsizes[cols_chroms[1]].values must all be zero.
 
     midpoints : pandas.DataFrame or dict-like
         Mapping of chromosomes to midpoint (aka centromere) locations.
-        If pandas.Series, a map from chromosomes to midpoints in bp.
+        If dict or pandas.Series, a map from chromosomes to midpoints in bp.
         If pandas.DataFrame, a dataframe with columns defined by cols_mids.
 
     cols_chroms : (str, str) or (str, str, str)
@@ -59,9 +59,13 @@ def make_chromarms(
     elif len(cols_chroms) == 3:
         ck1, sk1, ek1 = cols_chroms
 
-    if isinstance(chromsizes, pd.Series):
+    if isinstance(chromsizes, (pd.Series, dict)):
+        chromsizes = dict(chromsizes)
         df_chroms = (
-            pd.DataFrame(chromsizes).reset_index().rename(columns={"index": ck1})
+            pd.DataFrame({
+                ck1: list(chromsizes.keys()),
+                "length": list(chromsizes.values()),
+            })
         )
     elif isinstance(chromsizes, pd.DataFrame):
         df_chroms = chromsizes.copy()
@@ -83,7 +87,8 @@ def make_chromarms(
         raise ValueError("invalid number of cols_chroms")
 
     ck2, sk2 = cols_mids
-    if isinstance(midpoints, dict):
+    if isinstance(midpoints, (pd.Series, dict)):
+        midpoints = dict(midpoints)
         df_mids = pd.DataFrame.from_dict(midpoints, orient="index", columns=[sk2])
         df_mids.reset_index(inplace=True)
         df_mids.rename(columns={"index": ck2}, inplace=True)

diff --git a/bioframe/ops.py b/bioframe/ops.py
@@ -403,6 +403,9 @@ def overlap(
     return_overlap : bool
         If True, return overlapping intervals for the overlapping pairs
         as two additional columns (`overlap_start`, `overlap_end`).
+        If `cols1` is customized, its start/end names replace `start` and `end`.
+        When `return_overlap` is a string, its value is used for naming the overlap
+        columns: `return_overlap + "_start"`, `return_overlap + "_end"`.
         Default False.
 
     suffixes : (str, str)

diff --git a/docs/guide-intervalops.md b/docs/guide-intervalops.md
@@ -64,7 +64,7 @@ BedFrames satisfy the following properties:
 - chrom, start, end columns  
 - columns have valid dtypes (object/string/categorical, int/pd.Int64Dtype(), int/pd.Int64Dtype())  
 - for each interval, if any of chrom, start, end are null, then all are null
-- all starts < ends.  
+- all starts <= ends.  
 
 Whether a dataframe satisfies these properties can be checked with :func:`bioframe.core.checks.is_bedframe`:
 ```

diff --git a/docs/guide-performance.ipynb b/docs/guide-performance.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -69,7 +69,7 @@ docs = [
 homepage = "https://github.com/open2c/bioframe"
 documentation = "https://bioframe.readthedocs.io/en/latest"
 repository = "https://github.com/open2c/bioframe"
-changelog = "https://github.com/open2c/bioframe/blob/master/CHANGES.md"
+changelog = "https://github.com/open2c/bioframe/blob/main/CHANGES.md"
 
 [tool.hatch.version]
 path = "bioframe/_version.py"

diff --git a/tests/test_extras.py b/tests/test_extras.py
@@ -12,46 +12,63 @@
 def test_make_chromarms():
 
     ### test the case where columns have different names
-    df1 = pd.DataFrame(
+    df = pd.DataFrame(
         [["chrX", 0, 8]],
         columns=["chromosome", "lo", "hi"],
     )
-
-    df2 = pd.DataFrame([["chrX", 4]], columns=["chromosome", "loc"])
-
-    df_result = pd.DataFrame(
+    mids = pd.DataFrame([["chrX", 4]], columns=["chromosome", "loc"])
+    arms = pd.DataFrame(
         [
             ["chrX", 0, 4, "chrX_p"],
             ["chrX", 4, 8, "chrX_q"],
         ],
-        columns=["chromosome", "lo", "hi", "name"],
+        columns=["chrom", "start", "end", "name"],
     )
+    arms = arms.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})
 
     # test passing 3 columns
+    result = bioframe.make_chromarms(
+        df,
+        mids,
+        cols_chroms=["chromosome", "lo", "hi"],
+        cols_mids=["chromosome", "loc"],
+    )
     pd.testing.assert_frame_equal(
-        df_result.astype({"lo": pd.Int64Dtype(), "hi": pd.Int64Dtype()}),
-        bioframe.make_chromarms(
-            df1,
-            df2,
-            cols_chroms=["chromosome", "lo", "hi"],
-            cols_mids=["chromosome", "loc"],
-        ),
+        result,
+        arms.rename(columns={"chrom": "chromosome", "start": "lo", "end": "hi"})
     )
 
     # test passing 2 columns
+    result = bioframe.make_chromarms(
+        df,
+        mids,
+        cols_chroms=["chromosome", "hi"],
+        cols_mids=["chromosome", "loc"],
+    )
     pd.testing.assert_frame_equal(
-        df_result.astype({"lo": pd.Int64Dtype(), "hi": pd.Int64Dtype()}).rename(
-            columns={"lo": "start", "hi": "end"}
-        ),
-        bioframe.make_chromarms(
-            df1,
-            df2,
-            cols_chroms=["chromosome", "hi"],
-            cols_mids=["chromosome", "loc"],
-        ),
+        result,
+        arms.rename(columns={"chrom": "chromosome"}),
     )
 
-    # todo: test for passing pd.series !
+    # test for passing Series or dict; capture each result before asserting
+    result = bioframe.make_chromarms(
+        pd.Series({"chrX": 8}),
+        mids,
+        cols_mids=["chromosome", "loc"]
+    )
+    pd.testing.assert_frame_equal(arms, result)
+
+    result = bioframe.make_chromarms(pd.Series({"chrX": 8}), pd.Series({"chrX": 4}))
+    pd.testing.assert_frame_equal(arms, result)
+
+    result = bioframe.make_chromarms({"chrX": 8}, mids, cols_mids=["chromosome", "loc"])
+    pd.testing.assert_frame_equal(arms, result)
+
+    result = bioframe.make_chromarms({"chrX": 8}, pd.Series({"chrX": 4}))
+    pd.testing.assert_frame_equal(arms, result)
+
+    result = bioframe.make_chromarms({"chrX": 8}, {"chrX": 4})
+    pd.testing.assert_frame_equal(arms, result)
 
 
 def test_binnify():