Files
MLPproject/.venv/lib/python3.12/site-packages/xgboost/testing/quantile_dmatrix.py
2025-10-23 15:44:32 +02:00

69 lines
1.9 KiB
Python

"""QuantileDMatrix related tests."""
import numpy as np
import pytest
from sklearn.model_selection import train_test_split
import xgboost as xgb
from .data import make_batches, make_categorical
def check_ref_quantile_cut(device: str) -> None:
    """Verify that a ``ref`` QuantileDMatrix shares its cuts with the new matrix.

    A single batch is generated with ``make_batches`` and split into train and
    validation parts.  The validation matrix built with ``ref=Xy_train`` must
    report the same ``get_quantile_cut()`` result as the training matrix, while
    a validation matrix built without ``ref`` must not.

    Parameters
    ----------
    device :
        Device name.  When it starts with ``"cuda"`` the batch is requested
        with ``use_cupy=True``.
    """
    on_gpu = device.startswith("cuda")
    batches = make_batches(
        n_samples_per_batch=8192,
        n_features=16,
        n_batches=1,
        use_cupy=on_gpu,
    )
    # Only one batch is generated, so take the first entry of each returned
    # sequence; the third one is not needed here.
    X, y, _ = (per_kind[0] for per_kind in batches)

    # The split uses the default arguments of train_test_split.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y)

    Xy_train = xgb.QuantileDMatrix(X_train, y_train)
    ref_cuts = Xy_train.get_quantile_cut()

    # With a reference, both elements of the cut tuple must agree.
    Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid, ref=Xy_train)
    shared_cuts = Xy_valid.get_quantile_cut()
    for part in (0, 1):
        np.testing.assert_allclose(ref_cuts[part], shared_cuts[part])

    # Without a reference, the validation data is quantized on its own and the
    # second element of the cut tuple is expected to differ.
    Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid)
    own_cuts = Xy_valid.get_quantile_cut()
    assert not np.allclose(ref_cuts[1], own_cuts[1])
def check_categorical_strings(device: str) -> None:
    """Verify the quantile cuts of string-valued categorical columns.

    A dataset is produced by ``make_categorical`` with ``cat_dtype=np.str_``,
    wrapped in a dataframe and passed to ``QuantileDMatrix`` with
    ``enable_categorical=True``.  For every column whose dtype is categorical,
    the matching slice of the cut values must equal
    ``np.arange(0, n_categories)``.

    Parameters
    ----------
    device :
        ``"cpu"`` selects pandas as the dataframe library; any other value
        selects cudf.  The test is skipped when the library is unavailable.
    """
    df_lib_name = "pandas" if device == "cpu" else "cudf"
    df_lib = pytest.importorskip(df_lib_name)

    n_categories = 32
    n_features = 8
    X, y = make_categorical(
        1024,
        n_features,
        n_categories,
        onehot=False,
        cat_dtype=np.str_,
        cat_ratio=0.5,
        shuffle=True,
    )
    X = df_lib.DataFrame(X)

    Xy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
    assert Xy.num_col() == 8

    cuts = Xy.get_quantile_cut()
    indptr, values = cuts[0], cuts[1]
    expected_codes = np.arange(0, n_categories)

    # indptr[f_idx]:indptr[f_idx + 1] delimits the cut values of feature f_idx.
    for f_idx in range(len(indptr) - 1):
        column_dtype = X[X.columns[f_idx]].dtype
        if not isinstance(column_dtype, df_lib.CategoricalDtype):
            continue
        beg, end = indptr[f_idx], indptr[f_idx + 1]
        np.testing.assert_allclose(values[beg:end], expected_codes)