From 3ed64d3775b7b3180ed70951e912f4d5f9b9be75 Mon Sep 17 00:00:00 2001 From: Adam Amer <136176500+adamamer20@users.noreply.github.com> Date: Tue, 16 Jul 2024 10:07:29 +0200 Subject: [PATCH] Adding DataFrameMixin for improved reusability/encapsulation (#27) * change from types to types_ to avoid import issues * creation of DataFrameMixin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing space types (has its own PR) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update types with types_ * Moved agentset to library folder * update __init__ * remove geopandas * removed gpd --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- mesa_frames/__init__.py | 4 +- mesa_frames/abstract/agents.py | 2 +- mesa_frames/abstract/mixin.py | 82 +++++++++- mesa_frames/concrete/agents.py | 6 +- .../agentset.py} | 7 +- mesa_frames/concrete/pandas/mixin.py | 121 +++++++++++++++ .../agentset.py} | 9 +- mesa_frames/concrete/polars/mixin.py | 145 ++++++++++++++++++ mesa_frames/{types.py => types_.py} | 2 + tests/test_agents.py | 2 +- 10 files changed, 364 insertions(+), 16 deletions(-) rename mesa_frames/concrete/{agentset_pandas.py => pandas/agentset.py} (98%) create mode 100644 mesa_frames/concrete/pandas/mixin.py rename mesa_frames/concrete/{agentset_polars.py => polars/agentset.py} (98%) create mode 100644 mesa_frames/concrete/polars/mixin.py rename mesa_frames/{types.py => types_.py} (97%) diff --git a/mesa_frames/__init__.py b/mesa_frames/__init__.py index 61f25b5..4288c36 100644 --- a/mesa_frames/__init__.py +++ b/mesa_frames/__init__.py @@ -1,6 +1,6 @@ from mesa_frames.concrete.agents import AgentsDF -from mesa_frames.concrete.agentset_pandas import AgentSetPandas -from mesa_frames.concrete.agentset_polars import AgentSetPolars +from mesa_frames.concrete.pandas.agentset import AgentSetPandas +from 
mesa_frames.concrete.polars.agentset import AgentSetPolars from mesa_frames.concrete.model import ModelDF __all__ = [ diff --git a/mesa_frames/abstract/agents.py b/mesa_frames/abstract/agents.py index 4217648..93baaaa 100644 --- a/mesa_frames/abstract/agents.py +++ b/mesa_frames/abstract/agents.py @@ -9,7 +9,7 @@ from typing_extensions import Any, Self, overload from mesa_frames.abstract.mixin import CopyMixin -from mesa_frames.types import BoolSeries, DataFrame, IdsLike, Index, MaskLike, Series +from mesa_frames.types_ import BoolSeries, DataFrame, IdsLike, Index, MaskLike, Series if TYPE_CHECKING: from mesa_frames.concrete.agents import AgentSetDF diff --git a/mesa_frames/abstract/mixin.py b/mesa_frames/abstract/mixin.py index a62752d..6f59e2c 100644 --- a/mesa_frames/abstract/mixin.py +++ b/mesa_frames/abstract/mixin.py @@ -1,7 +1,11 @@ from abc import ABC, abstractmethod from copy import copy, deepcopy -from typing_extensions import Self +from typing_extensions import Any, Self +from typing import Literal +from collections.abc import Collection, Iterator, Sequence + +from mesa_frames.types_ import BoolSeries, DataFrame, MaskLike, Series class CopyMixin(ABC): @@ -142,3 +146,79 @@ def __deepcopy__(self, memo: dict) -> Self: A deep copy of the AgentContainer. """ return self.copy(deep=True, memo=memo) + + +class DataFrameMixin(ABC): + @abstractmethod + def _df_add_columns( + self, original_df: DataFrame, new_columns: list[str], data: Any + ) -> DataFrame: ... + + @abstractmethod + def _df_combine_first( + self, original_df: DataFrame, new_df: DataFrame, index_cols: list[str] + ) -> DataFrame: ... + + @abstractmethod + def _df_concat( + self, + dfs: Collection[DataFrame], + how: Literal["horizontal"] | Literal["vertical"] = "vertical", + ignore_index: bool = False, + ) -> DataFrame: ... 
+ + @abstractmethod + def _df_constructor( + self, + data: Sequence[Sequence] | dict[str | Any] | None = None, + columns: list[str] | None = None, + index_col: str | list[str] | None = None, + dtypes: dict[str, Any] | None = None, + ) -> DataFrame: ... + + @abstractmethod + def _df_get_bool_mask( + self, + df: DataFrame, + index_col: str, + mask: MaskLike | None = None, + negate: bool = False, + ) -> BoolSeries: ... + + @abstractmethod + def _df_get_masked_df( + self, + df: DataFrame, + index_col: str, + mask: MaskLike | None = None, + columns: list[str] | None = None, + negate: bool = False, + ) -> DataFrame: ... + + @abstractmethod + def _df_iterator(self, df: DataFrame) -> Iterator[dict[str, Any]]: ... + + @abstractmethod + def _df_remove( + self, df: DataFrame, ids: Sequence[Any], index_col: str | None = None + ) -> DataFrame: ... + + @abstractmethod + def _df_sample( + self, + df: DataFrame, + n: int | None = None, + frac: float | None = None, + with_replacement: bool = False, + shuffle: bool = False, + seed: int | None = None, + ) -> DataFrame: ... + + @abstractmethod + def _srs_constructor( + self, + data: Sequence[Any] | None = None, + name: str | None = None, + dtype: Any | None = None, + index: Sequence[Any] | None = None, + ) -> Series: ... 
diff --git a/mesa_frames/concrete/agents.py b/mesa_frames/concrete/agents.py index 14891b8..3f8530e 100644 --- a/mesa_frames/concrete/agents.py +++ b/mesa_frames/concrete/agents.py @@ -1,14 +1,12 @@ from collections import defaultdict from collections.abc import Callable, Collection, Iterable, Iterator, Sequence -from typing import Literal, cast +from typing import TYPE_CHECKING, Literal, cast import polars as pl from typing_extensions import Any, Self, overload -from typing import TYPE_CHECKING - from mesa_frames.abstract.agents import AgentContainer, AgentSetDF -from mesa_frames.types import ( +from mesa_frames.types_ import ( AgnosticMask, BoolSeries, DataFrame, diff --git a/mesa_frames/concrete/agentset_pandas.py b/mesa_frames/concrete/pandas/agentset.py similarity index 98% rename from mesa_frames/concrete/agentset_pandas.py rename to mesa_frames/concrete/pandas/agentset.py index 8f4ce4d..0378ae5 100644 --- a/mesa_frames/concrete/agentset_pandas.py +++ b/mesa_frames/concrete/pandas/agentset.py @@ -6,14 +6,15 @@ from typing_extensions import Any, Self, overload from mesa_frames.abstract.agents import AgentSetDF -from mesa_frames.concrete.agentset_polars import AgentSetPolars -from mesa_frames.types import PandasIdsLike, PandasMaskLike +from mesa_frames.concrete.pandas.mixin import PandasMixin +from mesa_frames.concrete.polars.agentset import AgentSetPolars +from mesa_frames.types_ import PandasIdsLike, PandasMaskLike if TYPE_CHECKING: from mesa_frames.concrete.model import ModelDF -class AgentSetPandas(AgentSetDF): +class AgentSetPandas(AgentSetDF, PandasMixin): _agents: pd.DataFrame _mask: pd.Series _copy_with_method: dict[str, tuple[str, list[str]]] = { diff --git a/mesa_frames/concrete/pandas/mixin.py b/mesa_frames/concrete/pandas/mixin.py new file mode 100644 index 0000000..bb1d546 --- /dev/null +++ b/mesa_frames/concrete/pandas/mixin.py @@ -0,0 +1,121 @@ +import pandas as pd +from typing_extensions import Any +from typing import Literal +from 
collections.abc import Collection, Iterator, Sequence + +from mesa_frames.abstract.mixin import DataFrameMixin +from mesa_frames.types_ import PandasMaskLike + + +class PandasMixin(DataFrameMixin): + def _df_add_columns( + self, original_df: pd.DataFrame, new_columns: list[str], data: Any + ) -> pd.DataFrame: + original_df[new_columns] = data + return original_df + + def _df_combine_first( + self, original_df: pd.DataFrame, new_df: pd.DataFrame, index_cols: list[str] + ) -> pd.DataFrame: + return original_df.combine_first(new_df) + + def _df_concat( + self, + dfs: Collection[pd.DataFrame], + how: Literal["horizontal"] | Literal["vertical"] = "vertical", + ignore_index: bool = False, + ) -> pd.DataFrame: + return pd.concat( + dfs, axis=0 if how == "vertical" else 1, ignore_index=ignore_index + ) + + def _df_constructor( + self, + data: Sequence[Sequence] | dict[str | Any] | None = None, + columns: list[str] | None = None, + index_col: str | list[str] | None = None, + dtypes: dict[str, Any] | None = None, + ) -> pd.DataFrame: + df = pd.DataFrame(data=data, columns=columns).astype(dtypes) + if index_col: + df.set_index(index_col) + return df + + def _df_get_bool_mask( + self, + df: pd.DataFrame, + index_col: str, + mask: PandasMaskLike = None, + negate: bool = False, + ) -> pd.Series: + if isinstance(mask, pd.Series) and mask.dtype == bool and len(mask) == len(df): + result = mask + elif isinstance(mask, pd.DataFrame): + if mask.index.name == index_col: + result = pd.Series(df.index.isin(mask.index), index=df.index) + elif index_col in mask.columns: + result = pd.Series(df.index.isin(mask[index_col]), index=df.index) + else: + raise ValueError( + f"A DataFrame mask must have a column/index with name {index_col}" + ) + elif mask is None or mask == "all": + result = pd.Series(True, index=df.index) + elif isinstance(mask, Sequence): + result = pd.Series(df.index.isin(mask), index=df.index) + else: + result = pd.Series(df.index.isin([mask]), index=df.index) + + if negate: 
+ result = ~result + + return result + + def _df_get_masked_df( + self, + df: pd.DataFrame, + index_col: str, + mask: PandasMaskLike | None = None, + columns: list[str] | None = None, + negate: bool = False, + ) -> pd.DataFrame: + b_mask = self._df_get_bool_mask(df, index_col, mask, negate) + if columns: + return df.loc[b_mask, columns] + return df.loc[b_mask] + + def _df_iterator(self, df: pd.DataFrame) -> Iterator[dict[str, Any]]: + for index, row in df.iterrows(): + row_dict = row.to_dict() + row_dict["unique_id"] = index + yield row_dict + + def _df_remove( + self, + df: pd.DataFrame, + ids: Sequence[Any], + index_col: str | None = None, + ) -> pd.DataFrame: + return df[~df.index.isin(ids)] + + def _df_sample( + self, + df: pd.DataFrame, + n: int | None = None, + frac: float | None = None, + with_replacement: bool = False, + shuffle: bool = False, + seed: int | None = None, + ) -> pd.DataFrame: + return df.sample( + n=n, frac=frac, replace=with_replacement, shuffle=shuffle, random_state=seed + ) + + def _srs_constructor( + self, + data: Sequence[Sequence] | None = None, + name: str | None = None, + dtype: Any | None = None, + index: Sequence[Any] | None = None, + ) -> pd.Series: + return pd.Series(data, name=name, dtype=dtype, index=index) diff --git a/mesa_frames/concrete/agentset_polars.py b/mesa_frames/concrete/polars/agentset.py similarity index 98% rename from mesa_frames/concrete/agentset_polars.py rename to mesa_frames/concrete/polars/agentset.py index 358a310..a9ad914 100644 --- a/mesa_frames/concrete/agentset_polars.py +++ b/mesa_frames/concrete/polars/agentset.py @@ -6,14 +6,15 @@ from typing_extensions import Any, Self, overload from mesa_frames.concrete.agents import AgentSetDF -from mesa_frames.types import PolarsIdsLike, PolarsMaskLike +from mesa_frames.concrete.polars.mixin import PolarsMixin +from mesa_frames.types_ import PolarsIdsLike, PolarsMaskLike if TYPE_CHECKING: - from mesa_frames.concrete.agentset_pandas import AgentSetPandas from 
mesa_frames.concrete.model import ModelDF + from mesa_frames.concrete.pandas.agentset import AgentSetPandas -class AgentSetPolars(AgentSetDF): +class AgentSetPolars(AgentSetDF, PolarsMixin): _agents: pl.DataFrame _copy_with_method: dict[str, tuple[str, list[str]]] = { "_agents": ("clone", []), @@ -309,7 +310,7 @@ def sort( return obj def to_pandas(self) -> "AgentSetPandas": - from mesa_frames.concrete.agentset_pandas import AgentSetPandas + from mesa_frames.concrete.pandas.agentset import AgentSetPandas new_obj = AgentSetPandas(self._model) new_obj._agents = self._agents.to_pandas() diff --git a/mesa_frames/concrete/polars/mixin.py b/mesa_frames/concrete/polars/mixin.py new file mode 100644 index 0000000..e292281 --- /dev/null +++ b/mesa_frames/concrete/polars/mixin.py @@ -0,0 +1,145 @@ +import polars as pl +from typing_extensions import Any +from typing import Literal +from collections.abc import Collection, Iterator, Sequence + +from mesa_frames.abstract.mixin import DataFrameMixin +from mesa_frames.types_ import PolarsMaskLike + + +class PolarsMixin(DataFrameMixin): + # TODO: complete with other dtypes + _dtypes_mapping: dict[str, Any] = {"int64": pl.Int64, "bool": pl.Boolean} + + def _df_add_columns( + self, original_df: pl.DataFrame, new_columns: list[str], data: Any + ) -> pl.DataFrame: + return original_df.with_columns( + **{col: value for col, value in zip(new_columns, data)} + ) + + def _df_combine_first( + self, original_df: pl.DataFrame, new_df: pl.DataFrame, index_cols: list[str] + ) -> pl.DataFrame: + new_df = original_df.join(new_df, on=index_cols, how="full", suffix="_right") + # Find columns with the _right suffix and update the corresponding original columns + updated_columns = [] + for col in new_df.columns: + if col.endswith("_right"): + original_col = col.replace("_right", "") + updated_columns.append( + pl.when(pl.col(col).is_not_null()) + .then(pl.col(col)) + .otherwise(pl.col(original_col)) + .alias(original_col) + ) + + # Apply the updates 
and remove the _right columns + new_df = new_df.with_columns(updated_columns).select( + pl.col(r"^(?!.*_right$).*") + ) + return new_df + + def _df_concat( + self, + dfs: Collection[pl.DataFrame], + how: Literal["horizontal"] | Literal["vertical"] = "vertical", + ignore_index: bool = False, + ) -> pl.DataFrame: + return pl.concat( + dfs, how="vertical_relaxed" if how == "vertical" else "horizontal_relaxed" + ) + + def _df_constructor( + self, + data: Sequence[Sequence] | dict[str | Any] | None = None, + columns: list[str] | None = None, + index_col: str | list[str] | None = None, + dtypes: dict[str, str] | None = None, + ) -> pl.DataFrame: + dtypes = {k: self._dtypes_mapping.get(v, v) for k, v in dtypes.items()} + return pl.DataFrame(data=data, schema=dtypes if dtypes else columns) + + def _df_get_bool_mask( + self, + df: pl.DataFrame, + index_col: str, + mask: PolarsMaskLike = None, + negate: bool = False, + ) -> pl.Series | pl.Expr: + def bool_mask_from_series(mask: pl.Series) -> pl.Series: + if ( + isinstance(mask, pl.Series) + and mask.dtype == pl.Boolean + and len(mask) == len(df) + ): + return mask + return df[index_col].is_in(mask) + + if isinstance(mask, pl.Expr): + result = mask + elif isinstance(mask, pl.Series): + result = bool_mask_from_series(mask) + elif isinstance(mask, pl.DataFrame): + if index_col in mask.columns: + result = bool_mask_from_series(mask[index_col]) + elif len(mask.columns) == 1 and mask.dtypes[0] == pl.Boolean: + result = bool_mask_from_series(mask[mask.columns[0]]) + else: + raise KeyError( + f"DataFrame must have an {index_col} column or a single boolean column." 
+ ) + elif mask is None or mask == "all": + result = pl.Series([True] * len(df)) + elif isinstance(mask, Collection): + result = bool_mask_from_series(pl.Series(mask)) + else: + result = bool_mask_from_series(pl.Series([mask])) + + if negate: + result = ~result + + return result + + def _df_get_masked_df( + self, + df: pl.DataFrame, + index_col: str, + mask: PolarsMaskLike | None = None, + columns: list[str] | None = None, + negate: bool = False, + ) -> pl.DataFrame: + b_mask = self._df_get_bool_mask(df, index_col, mask, negate=negate) + if columns: + return df.filter(b_mask)[columns] + return df.filter(b_mask) + + def _df_iterator(self, df: pl.DataFrame) -> Iterator[dict[str, Any]]: + return iter(df.iter_rows(named=True)) + + def _df_remove( + self, df: pl.DataFrame, ids: Sequence[Any], index_col: str | None = None + ) -> pl.DataFrame: + return df.filter(pl.col(index_col).is_in(ids).not_()) + + def _df_sample( + self, + df: pl.DataFrame, + n: int | None = None, + frac: float | None = None, + with_replacement: bool = False, + shuffle: bool = False, + seed: int | None = None, + ) -> pl.DataFrame: + return df.sample( + n=n, frac=frac, replace=with_replacement, shuffle=shuffle, seed=seed + ) + + def _srs_constructor( + self, + data: Sequence[Any] | None = None, + name: str | None = None, + dtype: Any | None = None, + index: Sequence[Any] | None = None, + ) -> pl.Series: + return pl.Series(name=name, values=data, dtype=self._dtypes_mapping[dtype]) diff --git a/mesa_frames/types.py b/mesa_frames/types_.py similarity index 97% rename from mesa_frames/types.py rename to mesa_frames/types_.py index 232aa0f..c34e792 100644 --- a/mesa_frames/types.py +++ b/mesa_frames/types_.py @@ -29,4 +29,6 @@ BoolSeries = pd.Series | pl.Series MaskLike = AgnosticMask | PandasMaskLike | PolarsMaskLike IdsLike = AgnosticIds | PandasIdsLike | PolarsIdsLike + +###----- Time ------### TimeT = float | int diff --git a/tests/test_agents.py b/tests/test_agents.py index 59d7be1..f1886b1 100644 --- 
a/tests/test_agents.py +++ b/tests/test_agents.py @@ -6,7 +6,7 @@ from mesa_frames import AgentsDF, ModelDF from mesa_frames.abstract.agents import AgentSetDF -from mesa_frames.types import MaskLike +from mesa_frames.types_ import MaskLike from tests.test_agentset_pandas import ( ExampleAgentSetPandas, fix1_AgentSetPandas,