69 lines
1.9 KiB
Python
69 lines
1.9 KiB
Python
"""QuantileDMatrix related tests."""
|
|
|
|
import numpy as np
|
|
import pytest
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
import xgboost as xgb
|
|
|
|
from .data import make_batches, make_categorical
|
|
|
|
|
|
def check_ref_quantile_cut(device: str) -> None:
    """Verify that a matrix built with ``ref`` reuses the reference's cuts.

    A validation ``QuantileDMatrix`` constructed with ``ref`` set to the
    training matrix must report the same cut pointers and cut values as the
    training matrix. A validation matrix constructed without ``ref`` is
    expected to yield different cut values.

    Parameters
    ----------
    device :
        Device name; ``use_cupy`` is enabled for the generated data when the
        name starts with ``"cuda"``.
    """
    batches = make_batches(
        n_samples_per_batch=8192,
        n_features=16,
        n_batches=1,
        use_cupy=device.startswith("cuda"),
    )
    # Only one batch is requested, so keep the first entry of every item
    # yielded by ``make_batches``; the third item is not needed here.
    X, y, _ = [item[0] for item in batches]

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    train_dm = xgb.QuantileDMatrix(X_train, y_train)
    valid_with_ref = xgb.QuantileDMatrix(X_valid, y_valid, ref=train_dm)

    train_indptr, train_values = train_dm.get_quantile_cut()
    ref_indptr, ref_values = valid_with_ref.get_quantile_cut()

    # With a reference, both the cut pointers and the cut values must agree.
    np.testing.assert_allclose(train_indptr, ref_indptr)
    np.testing.assert_allclose(train_values, ref_values)

    # Without a reference the validation data is sketched independently, so
    # the resulting cut values should no longer match the training ones.
    valid_alone = xgb.QuantileDMatrix(X_valid, y_valid)
    _, alone_values = valid_alone.get_quantile_cut()
    same_values = np.allclose(train_values, alone_values)
    assert not same_values
|
|
|
|
|
|
def check_categorical_strings(device: str) -> None:
    """Check that string-valued categorical columns get code-valued cuts.

    Builds a dataframe whose categorical columns are generated with a string
    dtype, feeds it to ``QuantileDMatrix`` with ``enable_categorical=True``
    and asserts that, for every column with a categorical dtype, the cut
    values are ``0, 1, ..., n_categories - 1``.

    Parameters
    ----------
    device :
        ``"cpu"`` selects pandas as the dataframe library; any other value
        selects cudf. The test is skipped when the library is missing.
    """
    df_module = "pandas" if device == "cpu" else "cudf"
    pd = pytest.importorskip(df_module)

    n_samples = 1024
    n_features = 8
    n_categories = 32
    X, y = make_categorical(
        n_samples,
        n_features,
        n_categories,
        onehot=False,
        cat_dtype=np.str_,
        cat_ratio=0.5,
        shuffle=True,
    )
    X = pd.DataFrame(X)

    Xy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
    assert Xy.num_col() == n_features

    indptr, values = Xy.get_quantile_cut()
    columns = X.columns
    expected_codes = np.arange(n_categories)

    # ``indptr`` has one more entry than there are features; consecutive
    # entries delimit the slice of ``values`` belonging to each feature.
    for f_idx in range(len(indptr) - 1):
        if not isinstance(X[columns[f_idx]].dtype, pd.CategoricalDtype):
            continue
        beg, end = indptr[f_idx], indptr[f_idx + 1]
        np.testing.assert_allclose(values[beg:end], expected_codes)
|