Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve coordinate dtypes #178

Merged
merged 5 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 65 additions & 39 deletions bioframe/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,18 @@ def _overlap_intidxs(df1, df2, how="left", cols1=None, cols2=None, on=None):
if on is not None:
group_list1 += on
group_list2 += on
df1_groups = df1.groupby(group_list1, observed=True, dropna=False).indices

df1_groups = df1.groupby(group_list1, observed=True, dropna=False).indices
df2_groups = df2.groupby(group_list2, observed=True, dropna=False).indices
all_groups = set.union(set(df1_groups), set(df2_groups))

# Extract coordinate columns
# .values will return a numpy array or pandas array, depending on the dtype
starts1 = df1[sk1].values
ends1 = df1[ek1].values
starts2 = df2[sk2].values
ends2 = df2[ek2].values

# Find overlapping intervals per group (determined by chrom and on).
overlap_intidxs = []
for group_keys in all_groups:
Expand All @@ -293,11 +300,12 @@ def _overlap_intidxs(df1, df2, how="left", cols1=None, cols2=None, on=None):
both_groups_nonempty = (df1_group_idxs.size > 0) and (df2_group_idxs.size > 0)

if both_groups_nonempty:

overlap_idxs_loc = arrops.overlap_intervals(
df1[sk1].values[df1_group_idxs],
df1[ek1].values[df1_group_idxs],
df2[sk2].values[df2_group_idxs],
df2[ek2].values[df2_group_idxs],
starts1[df1_group_idxs],
ends1[df1_group_idxs],
starts2[df2_group_idxs],
ends2[df2_group_idxs],
)

# Convert local per-chromosome indices into the
Expand Down Expand Up @@ -377,6 +385,7 @@ def overlap(
cols1=None,
cols2=None,
on=None,
ensure_nullable=False,
):
"""
Find pairs of overlapping genomic intervals.
Expand Down Expand Up @@ -428,17 +437,46 @@ def overlap(
when considering overlaps. A common use would be passing on=['strand'].
Default is None.

ensure_nullable : bool
If True, ensures that the output dataframe uses nullable Pandas
integer dtypes for start and end coordinates. This may involve
converting coordinate columns in the input dataframes.
Default False.

Returns
-------
df_overlap : pandas.DataFrame

Notes
-----
By default, the dtypes of the `start` and `end` coordinate columns
returned in the output dataframe are preserved from the input dataframes,
following native type casting rules if missing data are introduced.

This means, for example, that if `df1` uses a NumPy integer dtype for
`start` and/or `end`, the output dataframe will use the same dtype after
an inner join, but, due to casting rules, may produce ``float64`` after a
left/right/outer join with missing data stored as ``NaN``. On the other
hand, if `df1` uses Pandas nullable dtypes, the corresponding coordinate
columns will preserve the same dtype in the output, with missing data
stored as ``NA``. If ``ensure_nullable`` is True, the output dataframe will
always return Pandas nullable dtypes for start and end coordinates.
"""

ck1, sk1, ek1 = _get_default_colnames() if cols1 is None else cols1
ck2, sk2, ek2 = _get_default_colnames() if cols2 is None else cols2
checks.is_bedframe(df1, raise_errors=True, cols=[ck1, sk1, ek1])
checks.is_bedframe(df2, raise_errors=True, cols=[ck2, sk2, ek2])

if ensure_nullable:
df1 = df1.assign(**{
sk1: df1[sk1].convert_dtypes(),
ek1: df1[ek1].convert_dtypes(),
})
df2 = df2.assign(**{
sk2: df2[sk2].convert_dtypes(),
ek2: df2[ek2].convert_dtypes(),
})

if (how == "left") and (keep_order is None):
keep_order = True
if (how != "left") and keep_order:
Expand All @@ -460,13 +498,15 @@ def overlap(
cols2=cols2,
on=on,
)
events1 = overlap_df_idxs[:, 0]
events2 = overlap_df_idxs[:, 1]

# Generate output tables.
index_col = return_index if isinstance(return_index, str) else "index"
index_col_1 = index_col + suffixes[0]
index_col_2 = index_col + suffixes[1]
df_index_1 = pd.DataFrame({index_col_1: df1.index[overlap_df_idxs[:, 0]]})
df_index_2 = pd.DataFrame({index_col_2: df2.index[overlap_df_idxs[:, 1]]})
df_index_1 = pd.DataFrame({index_col_1: df1.index[events1]})
df_index_2 = pd.DataFrame({index_col_2: df2.index[events2]})

df_overlap = None
if return_overlap:
Expand All @@ -475,58 +515,44 @@ def overlap(
overlap_col_ek1 = overlap_col + "_" + ek1

overlap_start = np.maximum(
df1[sk1].values[overlap_df_idxs[:, 0]],
df2[sk2].values[overlap_df_idxs[:, 1]],
df1[sk1].values[events1],
df2[sk2].values[events2],
)

overlap_end = np.minimum(
df1[ek1].values[overlap_df_idxs[:, 0]],
df2[ek2].values[overlap_df_idxs[:, 1]],
df1[ek1].values[events1],
df2[ek2].values[events2],
)

df_overlap = pd.DataFrame(
{overlap_col_sk1: overlap_start, overlap_col_ek1: overlap_end}
{
overlap_col_sk1: overlap_start,
overlap_col_ek1: overlap_end
}
)

df_input_1 = None
df_input_2 = None
if return_input or str(return_input) == "1" or return_input == "left":
df_input_1 = df1.iloc[overlap_df_idxs[:, 0]].reset_index(drop=True)
df_input_1 = df1.iloc[events1].reset_index(drop=True)
df_input_1.columns = [c + suffixes[0] for c in df_input_1.columns]
if return_input or str(return_input) == "2" or return_input == "right":
df_input_2 = df2.iloc[overlap_df_idxs[:, 1]].reset_index(drop=True)
df_input_2 = df2.iloc[events2].reset_index(drop=True)
df_input_2.columns = [c + suffixes[1] for c in df_input_2.columns]

# Masking non-overlapping regions if using non-inner joins.
if how != "inner":
df_index_1[overlap_df_idxs[:, 0] == -1] = None
df_index_1 = df_index_1.astype({index_col_1: pd.Int64Dtype()})
df_index_2[overlap_df_idxs[:, 1] == -1] = None
df_index_2 = df_index_2.astype({index_col_2: pd.Int64Dtype()})
df_index_1[events1 == -1] = None
df_index_2[events2 == -1] = None

if df_input_1 is not None:
df_input_1[overlap_df_idxs[:, 0] == -1] = None
df_input_1 = df_input_1.astype(
{
(sk1 + suffixes[0]): pd.Int64Dtype(),
(ek1 + suffixes[0]): pd.Int64Dtype(),
}
)
df_input_1[events1 == -1] = None

if df_input_2 is not None:
df_input_2[overlap_df_idxs[:, 1] == -1] = None
df_input_2 = df_input_2.astype(
{
(sk2 + suffixes[1]): pd.Int64Dtype(),
(ek2 + suffixes[1]): pd.Int64Dtype(),
}
)
df_input_2[events2 == -1] = None

if df_overlap is not None:
df_overlap[
(overlap_df_idxs[:, 0] == -1) | (overlap_df_idxs[:, 1] == -1)
] = None
df_overlap = df_overlap.astype(
{overlap_col_sk1: pd.Int64Dtype(), (overlap_col_ek1): pd.Int64Dtype()}
)
df_overlap[(events1 == -1) | (events2 == -1)] = None

out_df = pd.concat(
[df_index_1, df_input_1, df_index_2, df_input_2, df_overlap], axis="columns"
Expand Down
105 changes: 94 additions & 11 deletions tests/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def test_trim():
["chrX_0", 1, 5],
],
columns=["chrom", "startFunky", "end"],
).astype({"startFunky": pd.Int64Dtype(), "end": pd.Int64Dtype()})
)
pd.testing.assert_frame_equal(
df_trimmed,
bioframe.trim(
Expand Down Expand Up @@ -522,6 +522,98 @@ def test_overlap():
)


def test_overlap_preserves_coord_dtypes():
df1 = pd.DataFrame(
[
["chr1", 8, 12, "+"],
["chr1", 7, 10, "-"],
["chrX", 1, 8, "+"],
],
columns=["chrom", "start", "end", "strand"],
).astype({"start": np.uint32, "end": np.uint32})
df2 = pd.DataFrame(
[
["chr1", 6, 10, "+"],
[pd.NA, pd.NA, pd.NA, "-"],
["chrX", 7, 10, "-"]
],
columns=["chrom", "start", "end", "strand"],
).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})

# inner join - left keeps non-nullable numpy uint32
overlap_dtypes = bioframe.overlap(df1, df2, how="inner").dtypes
overlap_dtypes = bioframe.overlap(df1, df2, how="inner").dtypes
for col in ["start", "end"]:
assert overlap_dtypes[col] == np.uint32
for col in ["start_", "end_"]:
assert overlap_dtypes[col] == pd.Int64Dtype()

# outer join - left uint32 gets cast to numpy float64
overlap_dtypes = bioframe.overlap(df1, df2, how="outer").dtypes
assert overlap_dtypes["start"] == np.float64
assert overlap_dtypes["end"] == np.float64
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()

# convert left coords to nullable *before* joining
overlap_dtypes = bioframe.overlap(df1.convert_dtypes(), df2, how="inner").dtypes
assert overlap_dtypes["start"] == pd.UInt32Dtype()
assert overlap_dtypes["end"] == pd.UInt32Dtype()
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()

# convert coords to nullable *after* joining
# inner join - uint32 output becomes UInt32
# outer join - float64 output becomes Int64
overlap_dtypes = bioframe.overlap(df1, df2, how="inner").convert_dtypes().dtypes
assert overlap_dtypes["start"] == pd.UInt32Dtype()
assert overlap_dtypes["end"] == pd.UInt32Dtype()
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()
overlap_dtypes = bioframe.overlap(df1, df2, how="outer").convert_dtypes().dtypes
assert overlap_dtypes["start"] == pd.Int64Dtype()
assert overlap_dtypes["end"] == pd.Int64Dtype()
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()


def test_overlap_ensure_nullable_coords():
df1 = pd.DataFrame(
[
["chr1", 8, 12, "+"],
["chr1", 7, 10, "-"],
["chrX", 1, 8, "+"],
],
columns=["chrom", "start", "end", "strand"],
).astype({"start": np.uint32, "end": np.uint32})
df2 = pd.DataFrame(
[
["chr1", 6, 10, "+"],
[pd.NA, pd.NA, pd.NA, "-"],
["chrX", 7, 10, "-"]
],
columns=["chrom", "start", "end", "strand"],
).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})

# inner join - left uint32 gets cast to UInt32
overlap_dtypes = bioframe.overlap(
df1, df2, how="inner", ensure_nullable=True
).dtypes
for col in ["start", "end"]:
assert overlap_dtypes[col] == pd.UInt32Dtype()
for col in ["start_", "end_"]:
assert overlap_dtypes[col] == pd.Int64Dtype()

# outer join - left uint32 gets cast to UInt32 before the join
overlap_dtypes = bioframe.overlap(
df1, df2, how="outer", ensure_nullable=True
).dtypes
for col in ["start", "end"]:
assert overlap_dtypes[col] == pd.UInt32Dtype()
for col in ["start_", "end_"]:
assert overlap_dtypes[col] == pd.Int64Dtype()


def test_cluster():
df1 = pd.DataFrame(
[
Expand Down Expand Up @@ -1575,9 +1667,6 @@ def test_assign_view():
],
columns=["chrom", "start", "end", "strand", "view_region"],
)
df_assigned = df_assigned.astype(
{"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
)
pd.testing.assert_frame_equal(df_assigned, bioframe.assign_view(df, view_df))

# non-default columns in view
Expand Down Expand Up @@ -1620,9 +1709,6 @@ def test_assign_view():
],
columns=["chrom", "start", "end", "strand", "funny_view_region"],
)
df_assigned = df_assigned.astype(
{"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
)

pd.testing.assert_frame_equal(
df_assigned,
Expand All @@ -1641,13 +1727,10 @@ def test_assign_view():
["chr1", 0, 10, "+", "apples"],
["chrX", 5, 10, "+", "oranges"],
["chrX", 0, 5, "+", "oranges"],
["chr2", 5, 10, "+", pd.NA],
["chr2", 5, 10, "+", None],
],
columns=["chrom", "start", "end", "strand", "funny_view_region"],
)
df_assigned = df_assigned.astype(
{"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
)

pd.testing.assert_frame_equal(
df_assigned,
Expand Down