Skip to content

Commit

Permalink
Add ensure_nullable option, tests, and docstring notes
Browse files Browse the repository at this point in the history
  • Loading branch information
nvictus committed Jan 3, 2024
1 parent fd4dbe8 commit 275a311
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 2 deletions.
31 changes: 31 additions & 0 deletions bioframe/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ def overlap(
cols1=None,
cols2=None,
on=None,
ensure_nullable=False,
):
"""
Find pairs of overlapping genomic intervals.
Expand Down Expand Up @@ -436,16 +437,46 @@ def overlap(
when considering overlaps. A common use would be passing on=['strand'].
Default is None.
ensure_nullable : bool
If True, ensures that the output dataframe uses nullable Pandas
integer dtypes for start and end coordinates. This may involve
converting coordinate columns in the input dataframes.
Default False.
Returns
-------
df_overlap : pandas.DataFrame
Notes
-----
By default, the dtypes of the `start` and `end` coordinate columns
returned in the output dataframe are preserved from the input dataframes,
following native type casting rules if missing data are introduced.
This means, for example, that if `df1` uses a NumPy integer dtype for
`start` and/or `end`, the output dataframe will use the same dtype after
an inner join, but, due to casting rules, may produce ``float64`` after a
left/right/outer join with missing data stored as ``NaN``. On the other
hand, if `df1` uses Pandas nullable dtypes, the corresponding coordinate
columns will preserve the same dtype in the output, with missing data
stored as ``NA``. If ``ensure_nullable`` is True, the `start` and `end`
coordinate columns of the output dataframe will always use Pandas nullable
dtypes.
"""
ck1, sk1, ek1 = _get_default_colnames() if cols1 is None else cols1
ck2, sk2, ek2 = _get_default_colnames() if cols2 is None else cols2
checks.is_bedframe(df1, raise_errors=True, cols=[ck1, sk1, ek1])
checks.is_bedframe(df2, raise_errors=True, cols=[ck2, sk2, ek2])

if ensure_nullable:
df1 = df1.assign(**{
sk1: df1[sk1].convert_dtypes(),
ek1: df1[ek1].convert_dtypes(),
})
df2 = df2.assign(**{
sk2: df2[sk2].convert_dtypes(),
ek2: df2[ek2].convert_dtypes(),
})

if (how == "left") and (keep_order is None):
keep_order = True
if (how != "left") and keep_order:
Expand Down
44 changes: 42 additions & 2 deletions tests/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,8 +542,11 @@ def test_overlap_preserves_coord_dtypes():

# inner join - left keeps non-nullable numpy uint32
overlap_dtypes = bioframe.overlap(df1, df2, how="inner").dtypes
assert (df1.dtypes == overlap_dtypes[:4]).all()
assert (df2.dtypes == overlap_dtypes[4:].rename(lambda x: x.replace("_", ""))).all()
overlap_dtypes = bioframe.overlap(df1, df2, how="inner").dtypes
for col in ["start", "end"]:
assert overlap_dtypes[col] == np.uint32
for col in ["start_", "end_"]:
assert overlap_dtypes[col] == pd.Int64Dtype()

# outer join - left uint32 gets cast to numpy float64
overlap_dtypes = bioframe.overlap(df1, df2, how="outer").dtypes
Expand Down Expand Up @@ -574,6 +577,43 @@ def test_overlap_preserves_coord_dtypes():
assert overlap_dtypes["end_"] == pd.Int64Dtype()


def test_overlap_ensure_nullable_coords():
    """Check coordinate dtypes of ``overlap`` when ``ensure_nullable=True``.

    The left frame stores ``start``/``end`` as NumPy ``uint32``; the right
    frame stores them as Pandas ``Int64`` and contains one all-missing
    interval. For both an inner and an outer join, the left coordinates are
    expected to come back as nullable ``UInt32`` and the right coordinates
    (suffixed with ``_``) as nullable ``Int64``.
    """
    columns = ["chrom", "start", "end", "strand"]

    left = pd.DataFrame(
        [
            ["chr1", 8, 12, "+"],
            ["chr1", 7, 10, "-"],
            ["chrX", 1, 8, "+"],
        ],
        columns=columns,
    ).astype({"start": np.uint32, "end": np.uint32})

    right = pd.DataFrame(
        [
            ["chr1", 6, 10, "+"],
            [pd.NA, pd.NA, pd.NA, "-"],
            ["chrX", 7, 10, "-"],
        ],
        columns=columns,
    ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})

    # Left uint32 coordinates become UInt32; right Int64 coordinates are
    # unchanged. Dict order mirrors the order in which columns are checked.
    expected_dtypes = {
        "start": pd.UInt32Dtype(),
        "end": pd.UInt32Dtype(),
        "start_": pd.Int64Dtype(),
        "end_": pd.Int64Dtype(),
    }

    # The same expectation holds for the inner join and for the outer join,
    # where the left uint32 gets cast to UInt32 before the join.
    for how in ("inner", "outer"):
        result_dtypes = bioframe.overlap(
            left, right, how=how, ensure_nullable=True
        ).dtypes
        for column, dtype in expected_dtypes.items():
            assert result_dtypes[column] == dtype


def test_cluster():
df1 = pd.DataFrame(
[
Expand Down

0 comments on commit 275a311

Please sign in to comment.