Skip to content

Commit

Permalink
Fix alignment issue with groupby index accessors (#1142)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Oct 8, 2024
1 parent 476e037 commit 25719af
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
5 changes: 4 additions & 1 deletion dask_expr/_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1326,11 +1326,14 @@ def _clean_by_expr(obj, by):
return by.name
elif isinstance(by, Index) and by._name == obj.index._name:
return by.expr
elif isinstance(by, Series):
elif isinstance(by, (Series, Index)):
if not are_co_aligned(obj.expr, by.expr):
raise NotImplementedError(
"by must be in the DataFrames columns or aligned with the DataFrame."
)
if isinstance(by, Index):
by = by.to_series()
by.index = obj.index
return by.expr

# By is a column name, e.g. str or int
Expand Down
16 changes: 16 additions & 0 deletions dask_expr/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1025,3 +1025,19 @@ def test_get_group_multiple_keys():
result = df.groupby(["x", "y"]).get_group((1, 1))
expected = pdf.groupby(["x", "y"]).get_group((1, 1))
assert_eq(result, expected)


def test_groupby_index_modified_divisions():
    """Group by an index accessor on a frame that was repartitioned.

    Builds a minute-resolution frame whose index is named ``timestamp``,
    splits it into 8 partitions, repartitions it with ``freq="1D"`` and
    checks that grouping by ``index.dt.date`` and counting gives the same
    result as the equivalent pandas operation.
    """
    minutes = pd.date_range(start="2023-01-01", end="2023-01-02", freq="1min")
    n_rows = len(minutes)
    pdf = pd.DataFrame(
        {
            # The same values are used both as a regular column and as the index.
            "timestamp": minutes,
            "upper_bound_enter": [1] * n_rows,
            "vwap": [2] * n_rows,
        },
        index=pd.Index(minutes, name="timestamp"),
    )

    # Start from 8 partitions, then repartition by day before grouping.
    df = from_pandas(pdf, npartitions=8).repartition(freq="1D")

    result = df.groupby(df.index.dt.date).count()
    expected = pdf.groupby(pdf.index.date).count()
    assert_eq(result, expected)

0 comments on commit 25719af

Please sign in to comment.