Skip to content

Commit

Permalink
Add datagen for testing string-based categorical data. (#11114)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Dec 19, 2024
1 parent dc092ae commit 24e19e7
Show file tree
Hide file tree
Showing 8 changed files with 176 additions and 108 deletions.
6 changes: 1 addition & 5 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,11 +995,7 @@ def _from_uri(
_warn_unused_missing(data, missing)
handle = ctypes.c_void_p()
data = os.fspath(os.path.expanduser(data))
args = {
"uri": str(data),
"data_split_mode": int(data_split_mode),
}
config = bytes(json.dumps(args), "utf-8")
config = make_jcargs(uri=str(data), data_split_mode=int(data_split_mode))
_check_call(_LIB.XGDMatrixCreateFromURI(config, ctypes.byref(handle)))
return handle, feature_names, feature_types

Expand Down
84 changes: 4 additions & 80 deletions python-package/xgboost/testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,19 @@
import xgboost as xgb
from xgboost import RabitTracker
from xgboost.core import ArrayLike
from xgboost.data import is_pd_cat_dtype
from xgboost.sklearn import SklObjective
from xgboost.testing.data import (

from .._typing import PathLike
from .data import (
get_california_housing,
get_cancer,
get_digits,
get_sparse,
make_batches,
make_categorical,
make_sparse_regression,
memory,
)

from .._typing import PathLike

hypothesis = pytest.importorskip("hypothesis")

# pylint:disable=wrong-import-position,wrong-import-order
Expand Down Expand Up @@ -377,81 +376,6 @@ def __repr__(self) -> str:
return self.name


# pylint: disable=too-many-arguments,too-many-locals
@memory.cache
def make_categorical(
    n_samples: int,
    n_features: int,
    n_categories: int,
    *,
    onehot: bool,
    sparsity: float = 0.0,
    cat_ratio: float = 1.0,
    shuffle: bool = False,
    random_state: int = 1994,
) -> Tuple[ArrayLike, np.ndarray]:
    """Generate categorical features for test.

    Parameters
    ----------
    n_samples:
        Number of rows.
    n_features:
        Number of feature columns in the returned frame (before one-hot encoding).
    n_categories:
        Number of categories for categorical features.
    onehot:
        Should we apply one-hot encoding to the data?
    sparsity:
        The ratio of the amount of missing values over the number of all entries.
        Row indices are drawn with replacement, so the realised ratio can be lower.
    cat_ratio:
        The ratio of features that are categorical.
    shuffle:
        Whether we should shuffle the columns.
    random_state:
        Seed for the random number generator.

    Returns
    -------
    X, y
        ``X`` is a pandas DataFrame. ``y`` is the sum of all generated integer
        columns (including one extra column that is not part of ``X``) plus one.
    """
    import pandas as pd

    rng = np.random.RandomState(random_state)

    # One extra column is generated: column "0" seeds the label and is dropped from
    # the features below.
    pd_dict = {}
    for i in range(n_features + 1):
        c = rng.randint(low=0, high=n_categories, size=n_samples)
        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

    df = pd.DataFrame(pd_dict)
    label = df.iloc[:, 0]
    df = df.iloc[:, 1:]
    # The label is accumulated while every column is still integer typed.
    for i in range(0, n_features):
        label += df.iloc[:, i]
    label += 1

    categories = np.arange(0, n_categories)
    for col in df.columns:
        # Each column independently becomes categorical with probability cat_ratio.
        if rng.binomial(1, cat_ratio, size=1)[0] == 1:
            df[col] = df[col].astype("category")
            df[col] = df[col].cat.set_categories(categories)

    if sparsity > 0.0:
        n_missing = int(n_samples * sparsity)
        for i in range(n_features):
            # `high` is exclusive for randint; using n_samples (not n_samples - 1)
            # lets the last row become missing and keeps n_samples == 1 valid.
            index = rng.randint(low=0, high=n_samples, size=n_missing)
            df.iloc[index, i] = np.nan
            if is_pd_cat_dtype(df.dtypes.iloc[i]):
                # Injecting missing values must not change the category set.
                assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

    assert df.shape[1] == n_features
    if onehot:
        df = pd.get_dummies(df)

    if shuffle:
        columns = list(df.columns)
        rng.shuffle(columns)
        df = df[columns]

    return df, label


def make_ltr(
n_samples: int,
n_features: int,
Expand Down
135 changes: 119 additions & 16 deletions python-package/xgboost/testing/data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# pylint: disable=invalid-name
# pylint: disable=invalid-name, too-many-lines
"""Utilities for data generation."""
import multiprocessing
import os
import string
import zipfile
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
Expand All @@ -14,6 +15,7 @@
List,
NamedTuple,
Optional,
Set,
Tuple,
Type,
Union,
Expand All @@ -26,8 +28,10 @@
from numpy.random import Generator as RNG
from scipy import sparse

import xgboost
from xgboost.data import pandas_pyarrow_mapper
from ..core import DMatrix, QuantileDMatrix
from ..data import is_pd_cat_dtype, pandas_pyarrow_mapper
from ..sklearn import ArrayLike, XGBRanker
from ..training import train as train_fn

if TYPE_CHECKING:
from ..compat import DataFrame as DataFrameT
Expand All @@ -42,7 +46,7 @@ def np_dtypes(
n_samples: int, n_features: int
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
"""Enumerate all supported dtypes from numpy."""
import pandas as pd
pd = pytest.importorskip("pandas")

rng = np.random.RandomState(1994)
# Integer and float.
Expand Down Expand Up @@ -99,7 +103,7 @@ def np_dtypes(

def pd_dtypes() -> Generator:
"""Enumerate all supported pandas extension types."""
import pandas as pd
pd = pytest.importorskip("pandas")

# Integer
dtypes = [
Expand Down Expand Up @@ -162,8 +166,8 @@ def pd_dtypes() -> Generator:

def pd_arrow_dtypes() -> Generator:
"""Pandas DataFrame with pyarrow backed type."""
import pandas as pd
import pyarrow as pa
pd = pytest.importorskip("pandas")
pa = pytest.importorskip("pyarrow")

# Integer
dtypes = pandas_pyarrow_mapper
Expand Down Expand Up @@ -225,10 +229,10 @@ def check_inf(rng: RNG) -> None:
X[5, 2] = np.inf

with pytest.raises(ValueError, match="Input data contains `inf`"):
xgboost.QuantileDMatrix(X, y)
QuantileDMatrix(X, y)

with pytest.raises(ValueError, match="Input data contains `inf`"):
xgboost.DMatrix(X, y)
DMatrix(X, y)


@memory.cache
Expand Down Expand Up @@ -288,8 +292,10 @@ def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
Number of categorical features: 10
Number of numerical features: 10
"""
pytest.importorskip("pandas")
import pandas as pd
if TYPE_CHECKING:
import pandas as pd
else:
pd = pytest.importorskip("pandas")

rng = np.random.default_rng(1994)
n_samples = 1460
Expand Down Expand Up @@ -664,7 +670,7 @@ def init_rank_score(
y_train = y_train[sorted_idx]
qid_train = qid_train[sorted_idx]

ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
ltr = XGBRanker(objective="rank:ndcg", tree_method="hist")
ltr.fit(X_train, y_train, qid=qid_train)

# Use the original order of the data.
Expand Down Expand Up @@ -799,9 +805,7 @@ def sort_ltr_samples(
return data


def run_base_margin_info(
DType: Callable, DMatrixT: Type[xgboost.DMatrix], device: str
) -> None:
def run_base_margin_info(DType: Callable, DMatrixT: Type[DMatrix], device: str) -> None:
"""Run tests for base margin."""
rng = np.random.default_rng()
X = DType(rng.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2))
Expand All @@ -814,7 +818,7 @@ def run_base_margin_info(
Xy = DMatrixT(X, y, base_margin=base_margin)
# Error at train, caused by check in predictor.
with pytest.raises(ValueError, match=r".*base_margin.*"):
xgboost.train({"tree_method": "hist", "device": device}, Xy)
train_fn({"tree_method": "hist", "device": device}, Xy)

if not hasattr(X, "iloc"):
# column major matrix
Expand Down Expand Up @@ -932,3 +936,102 @@ def random_csc(t_id: int) -> sparse.csc_matrix:
return arr, y

return csr, y


def unique_random_strings(n_strings: int, seed: int) -> List[str]:
    """Generate n unique random strings in a reproducible order.

    Parameters
    ----------
    n_strings:
        Number of distinct strings to generate. A non-positive value yields an
        empty list.
    seed:
        Seed for the random number generator.

    Returns
    -------
    A list of ``n_strings`` distinct strings, each made of 8 ASCII letters, in the
    order in which they were first drawn.
    """
    name_len = 8  # hardcoded, should be more than enough
    alphabet = list(string.ascii_letters)
    rng = np.random.default_rng(seed)

    # The set is only used for membership tests. The result is collected in a list
    # because iterating a set of strings depends on the per-process hash seed, which
    # would make the returned order differ between interpreter runs.
    seen: Set[str] = set()
    result: List[str] = []
    while len(result) < n_strings:
        random_str = "".join(rng.choice(alphabet, size=name_len, replace=True))
        if random_str not in seen:
            seen.add(random_str)
            result.append(random_str)

    return result


# pylint: disable=too-many-arguments,too-many-locals,too-many-branches
def make_categorical(
    n_samples: int,
    n_features: int,
    n_categories: int,
    *,
    onehot: bool,
    sparsity: float = 0.0,
    cat_ratio: float = 1.0,
    shuffle: bool = False,
    random_state: int = 1994,
    cat_dtype: np.typing.DTypeLike = np.int64,
) -> Tuple[ArrayLike, np.ndarray]:
    """Generate categorical features for test.

    Parameters
    ----------
    n_samples:
        Number of rows.
    n_features:
        Number of feature columns (before one-hot encoding).
    n_categories:
        Number of categories for categorical features.
    onehot:
        Should we apply one-hot encoding to the data?
    sparsity:
        The ratio of the amount of missing values over the number of all entries.
        Row indices are drawn with replacement, so the realised ratio can be lower.
    cat_ratio:
        The ratio of features that are categorical.
    shuffle:
        Whether we should shuffle the columns.
    random_state:
        Seed for the random number generator.
    cat_dtype :
        The dtype for categorical features, might be string or numeric. Only
        whether it is a string dtype is inspected: string dtypes use categories
        from ``unique_random_strings``, anything else uses the integers
        ``0 .. n_categories - 1``.

    Returns
    -------
    X, y
        ``X`` is a pandas DataFrame. ``y`` is the per-row sum of the category codes
        (categorical columns) and raw values (numerical columns) plus one.
    """
    pd = pytest.importorskip("pandas")

    rng = np.random.RandomState(random_state)
    use_strings = np.issubdtype(cat_dtype, np.str_)

    # Collect the columns first and build the frame once, instead of inserting
    # columns one at a time into an empty DataFrame.
    generated = {}
    for i in range(n_features):
        name = str(i)
        # Each column independently becomes categorical with probability cat_ratio.
        if rng.binomial(1, cat_ratio, size=1)[0] == 1:
            if use_strings:
                # The feature index is the seed for this column's category names.
                categories = np.array(unique_random_strings(n_categories, i))
                values = rng.choice(categories, size=n_samples, replace=True)
            else:
                categories = np.arange(0, n_categories)
                values = rng.randint(low=0, high=n_categories, size=n_samples)

            series = pd.Series(values, dtype="category")
            # Pin the full category set, including categories that were not drawn.
            generated[name] = series.cat.set_categories(categories)
        else:
            values = rng.randint(low=0, high=n_categories, size=n_samples)
            generated[name] = pd.Series(values, dtype=values.dtype)
    df = pd.DataFrame(generated)

    # The label is computed before any missing value is injected.
    label = np.zeros(shape=(n_samples,))
    for col in df.columns:
        column = df[col]
        if isinstance(column.dtype, pd.CategoricalDtype):
            label += column.cat.codes
        else:
            label += column
    label += 1

    if sparsity > 0.0:
        n_missing = int(n_samples * sparsity)
        for i in range(n_features):
            # `high` is exclusive for randint; using n_samples (not n_samples - 1)
            # lets the last row become missing and keeps n_samples == 1 valid.
            index = rng.randint(low=0, high=n_samples, size=n_missing)
            df.iloc[index, i] = np.nan
            if is_pd_cat_dtype(df.dtypes.iloc[i]):
                # Injecting missing values must not change the category set.
                assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

    assert df.shape[1] == n_features
    if onehot:
        df = pd.get_dummies(df)

    if shuffle:
        columns = list(df.columns)
        rng.shuffle(columns)
        df = df[columns]

    return df, label
35 changes: 34 additions & 1 deletion python-package/xgboost/testing/quantile_dmatrix.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""QuantileDMatrix related tests."""

import numpy as np
import pytest
from sklearn.model_selection import train_test_split

import xgboost as xgb

from .data import make_batches
from .data import make_batches, make_categorical


def check_ref_quantile_cut(device: str) -> None:
Expand Down Expand Up @@ -33,3 +34,35 @@ def check_ref_quantile_cut(device: str) -> None:
Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid)
cut_valid = Xy_valid.get_quantile_cut()
assert not np.allclose(cut_train[1], cut_valid[1])


def check_categorical_strings(device: str) -> None:
    """Check that string-valued categorical features produce integer-coded cuts.

    Builds a mixed categorical/numerical frame with string categories, constructs a
    ``QuantileDMatrix`` from it, and asserts that the quantile cut values of every
    categorical feature equal ``0 .. n_categories - 1``.

    Parameters
    ----------
    device:
        ``"cpu"`` uses pandas for the DataFrame; any other value uses cudf. The test
        is skipped when the selected library cannot be imported.
    """
    df_lib = pytest.importorskip("pandas" if device == "cpu" else "cudf")

    n_features = 8
    n_categories = 32
    data, y = make_categorical(
        1024,
        n_features,
        n_categories,
        onehot=False,
        cat_dtype=np.str_,
        cat_ratio=0.5,
        shuffle=True,
    )
    # Re-wrap with the selected library's DataFrame constructor.
    X = df_lib.DataFrame(data)

    Xy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
    assert Xy.num_col() == n_features

    cuts = Xy.get_quantile_cut()
    indptr, values = cuts[0], cuts[1]
    expected = np.arange(0, n_categories)
    # Consecutive indptr entries delimit the cut values of one feature.
    for f_idx, (beg, end) in enumerate(zip(indptr[:-1], indptr[1:])):
        feature = X[X.columns[f_idx]]
        if isinstance(feature.dtype, df_lib.CategoricalDtype):
            np.testing.assert_allclose(values[beg:end], expected)
Loading

0 comments on commit 24e19e7

Please sign in to comment.