Add ensure_nullable option, tests, and docstring notes

open2c · Jan 3, 2024 · 275a311 · 275a311
1 parent fd4dbe8
commit 275a311
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 2 deletions.
diff --git a/bioframe/ops.py b/bioframe/ops.py
@@ -385,6 +385,7 @@ def overlap(
     cols1=None,
     cols2=None,
     on=None,
+    ensure_nullable=False,
 ):
     """
     Find pairs of overlapping genomic intervals.
@@ -436,16 +437,46 @@ def overlap(
         when considering overlaps. A common use would be passing on=['strand'].
         Default is None.
 
+    ensure_nullable : bool, optional
+        If True, ensures that the output dataframe uses nullable Pandas
+        integer dtypes for start and end coordinates. Any conversion is
+        applied to copies of the input dataframes, so `df1` and `df2`
+        themselves are not modified. Default False.
+
     Returns
     -------
     df_overlap : pandas.DataFrame
 
+    Notes
+    -----
+    By default, the dtypes of the `start` and `end` coordinate columns
+    returned in the output dataframe are preserved from the input dataframes,
+    following native type casting rules if missing data are introduced.
+
+    This means, for example, that if `df1` uses a NumPy integer dtype for
+    `start` and/or `end`, the output dataframe will use the same dtype after
+    an inner join, but, due to casting rules, may produce ``float64`` after a
+    left/right/outer join with missing data stored as ``NaN``. On the other
+    hand, if `df1` uses Pandas nullable dtypes, the corresponding coordinate
+    columns will preserve the same dtype in the output, with missing data
+    stored as ``NA``. If ``ensure_nullable`` is True, the output dataframe will
+    always use Pandas nullable dtypes for start and end coordinates.
     """
     ck1, sk1, ek1 = _get_default_colnames() if cols1 is None else cols1
     ck2, sk2, ek2 = _get_default_colnames() if cols2 is None else cols2
     checks.is_bedframe(df1, raise_errors=True, cols=[ck1, sk1, ek1])
     checks.is_bedframe(df2, raise_errors=True, cols=[ck2, sk2, ek2])
 
+    if ensure_nullable:
+        df1 = df1.assign(**{
+            sk1: df1[sk1].convert_dtypes(),
+            ek1: df1[ek1].convert_dtypes(),
+        })
+        df2 = df2.assign(**{
+            sk2: df2[sk2].convert_dtypes(),
+            ek2: df2[ek2].convert_dtypes(),
+        })
+
     if (how == "left") and (keep_order is None):
         keep_order = True
     if (how != "left") and keep_order:

diff --git a/tests/test_ops.py b/tests/test_ops.py
@@ -542,8 +542,11 @@ def test_overlap_preserves_coord_dtypes():
 
     # inner join - left keeps non-nullable numpy uint32
     overlap_dtypes = bioframe.overlap(df1,  df2, how="inner").dtypes
-    assert (df1.dtypes == overlap_dtypes[:4]).all()
-    assert (df2.dtypes == overlap_dtypes[4:].rename(lambda x: x.replace("_", ""))).all()
+    overlap_dtypes = bioframe.overlap(df1,  df2, how="inner").dtypes
+    for col in ["start", "end"]:
+        assert overlap_dtypes[col] == np.uint32
+    for col in ["start_", "end_"]:
+        assert overlap_dtypes[col] == pd.Int64Dtype()
 
     # outer join - left uint32 gets cast to numpy float64
     overlap_dtypes = bioframe.overlap(df1,  df2, how="outer").dtypes
@@ -574,6 +577,43 @@ def test_overlap_preserves_coord_dtypes():
     assert overlap_dtypes["end_"] == pd.Int64Dtype()
 
 
+def test_overlap_ensure_nullable_coords():
+    """With ``ensure_nullable=True`` all output coordinates are nullable."""
+    colnames = ["chrom", "start", "end", "strand"]
+
+    # Left frame: non-nullable NumPy unsigned 32-bit coordinates.
+    left_rows = [
+        ["chr1", 8, 12, "+"],
+        ["chr1", 7, 10, "-"],
+        ["chrX", 1, 8, "+"],
+    ]
+    df1 = pd.DataFrame(left_rows, columns=colnames)
+    df1 = df1.astype({"start": np.uint32, "end": np.uint32})
+
+    # Right frame: nullable Int64 coordinates, including an all-NA interval.
+    right_rows = [
+        ["chr1", 6, 10, "+"],
+        [pd.NA, pd.NA, pd.NA, "-"],
+        ["chrX", 7, 10, "-"],
+    ]
+    df2 = pd.DataFrame(right_rows, columns=colnames)
+    df2 = df2.astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})
+
+    # The left uint32 columns are converted to UInt32 before the join, so
+    # the expected dtypes are the same for the inner and the outer join.
+    expected = {
+        "start": pd.UInt32Dtype(),
+        "end": pd.UInt32Dtype(),
+        "start_": pd.Int64Dtype(),
+        "end_": pd.Int64Dtype(),
+    }
+    for how in ("inner", "outer"):
+        joined = bioframe.overlap(df1, df2, how=how, ensure_nullable=True)
+        overlap_dtypes = joined.dtypes
+        for col, want in expected.items():
+            assert overlap_dtypes[col] == want
+
+
 def test_cluster():
     df1 = pd.DataFrame(
         [