Files
MLPproject/.venv/lib/python3.12/site-packages/xgboost/testing/data_iter.py
2025-10-23 15:44:32 +02:00

175 lines
4.8 KiB
Python

"""Tests related to the `DataIter` interface."""
from typing import Callable, Optional
import numpy as np
from xgboost import testing as tm
from ..compat import import_cupy
from ..core import DataIter, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
from .utils import predictor_equal
def run_mixed_sparsity(device: str) -> None:
    """Check QDM built from batches of mixed sparsity against a single-array QDM.

    Three batches are generated: one from ``tm.make_regression`` (128 rows) and
    two from ``tm.make_sparse_regression`` (256 rows with sparsity argument 0.1,
    512 rows with sparsity argument 0.9), all with 16 features.  They are fed
    through ``tm.IteratorForTest`` into a ``QuantileDMatrix``; the result is
    compared with a ``QuantileDMatrix`` built directly from the concatenated
    arrays using ``predictor_equal``.

    Parameters
    ----------
    device :
        Device string.  Any value starting with ``"cuda"`` makes the batches be
        converted to cupy arrays before they are handed to the iterator.
    """
    dense_X, dense_y, _ = tm.make_regression(128, 16, False)

    on_cuda = device.startswith("cuda")
    # The last positional argument is True on CUDA and False otherwise.
    # NOTE(review): presumably this is the "return dense output" switch of
    # make_sparse_regression -- confirm against its signature.
    mid_X, mid_y = tm.make_sparse_regression(256, 16, 0.1, on_cuda)
    tail_X, tail_y = tm.make_sparse_regression(512, 16, 0.9, True)

    batches = [dense_X, mid_X, tail_X]
    labels = [dense_y, mid_y, tail_y]
    if on_cuda:
        cp = import_cupy()
        batches = [cp.array(part) for part in batches]

    it = tm.IteratorForTest(batches, labels, None, cache=None, on_host=False)
    from_iter = QuantileDMatrix(it)

    # Regenerate the middle batch with the last argument forced to True so that
    # it can be concatenated with the other two via numpy.
    # NOTE(review): this assumes make_sparse_regression returns the same data
    # for the same size/sparsity arguments -- confirm.
    mid_X, mid_y = tm.make_sparse_regression(256, 16, 0.1, True)
    merged_X = np.concatenate([dense_X, mid_X, tail_X], axis=0)
    merged_y = np.concatenate([dense_y, mid_y, tail_y], axis=0)
    from_arrays = QuantileDMatrix(merged_X, merged_y)

    assert predictor_equal(from_iter, from_arrays)
def check_invalid_cat_batches(device: str) -> None:
    """Check the errors raised when batches disagree on categorical features.

    A two-batch iterator is built whose first batch is generated with
    ``cat_ratio=1.0`` and whose second batch uses ``cat_ratio=0.5``.  Building
    ``ExtMemQuantileDMatrix``, ``QuantileDMatrix`` and ``DMatrix`` from it is
    expected to raise ``ValueError`` with the matching message.

    Parameters
    ----------
    device :
        When equal to ``"cuda"``, each batch is converted to a cudf DataFrame
        and a cupy label array before being passed on.
    """

    class _InvalidCatIter(DataIter):
        """Yield two batches generated with different ``cat_ratio`` values."""

        def __init__(self) -> None:
            super().__init__(cache_prefix=None)
            self._it = 0

        def next(self, input_data: Callable) -> bool:
            # Only two batches are produced; afterwards report exhaustion.
            if self._it == 2:
                return False

            # The first batch uses a different ``cat_ratio`` than the second.
            ratio = 1.0 if self._it == 0 else 0.5
            X, y = tm.make_categorical(
                64, 12, 4, onehot=False, sparsity=0.5, cat_ratio=ratio
            )
            if device == "cuda":
                # Imported lazily so that CPU-only runs do not need them.
                import cudf
                import cupy

                X, y = cudf.DataFrame(X), cupy.array(y)

            input_data(data=X, label=y)
            self._it += 1
            return True

        def reset(self) -> None:
            self._it = 0

    it = _InvalidCatIter()

    # Imported locally rather than at module level.
    import pytest

    # (matrix constructor, expected error message pattern), checked in order.
    expectations = (
        (ExtMemQuantileDMatrix, "Inconsistent number of categories between"),
        (QuantileDMatrix, "Inconsistent number of categories between"),
        (DMatrix, "Inconsistent feature types"),
    )
    for matrix_cls, pattern in expectations:
        with pytest.raises(ValueError, match=pattern):
            matrix_cls(it, enable_categorical=True)
def check_uneven_sizes(device: str) -> None:
    """Check matrix shapes when the batches have different numbers of rows.

    Three regression batches with 512, 256 and 1024 rows (16 features each) are
    served through ``tm.IteratorForTest``.  Both ``DMatrix`` and
    ``ExtMemQuantileDMatrix`` built from that iterator must report 16 columns
    and a row count equal to the sum of the batch row counts.

    Parameters
    ----------
    device :
        When equal to ``"cuda"``, ``use_cupy=True`` is passed to
        ``tm.make_regression``.
    """
    use_cupy = device == "cuda"
    batches = [
        tm.make_regression(n_rows, 16, use_cupy=use_cupy)
        for n_rows in [512, 256, 1024]
    ]
    # Transpose the per-batch tuples: first all features, then all labels.
    features, labels, *_ = zip(*batches)
    it = tm.IteratorForTest(features, labels, None, cache="cache", on_host=True)

    expected_rows = sum(x.shape[0] for x in features)
    for matrix_cls in (DMatrix, ExtMemQuantileDMatrix):
        Xy = matrix_cls(it)
        assert Xy.num_col() == 16
        assert Xy.num_row() == expected_rows
class CatIter(DataIter):  # pylint: disable=too-many-instance-attributes
    """Batch iterator over a generated categorical dataset, used in tests.

    The full dataset is generated once with ``tm.make_categorical`` and then
    cut into ``n_batches`` consecutive row slices.  The slices are kept in
    ``xs``/``ys`` and the unsplit data in ``x``/``y``.
    """

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        n_samples_per_batch: int,
        n_features: int,
        *,
        n_batches: int,
        n_cats: int,
        sparsity: float,
        cat_ratio: float,
        onehot: bool,
        device: str,
        cache: Optional[str],
    ) -> None:
        """Generate the data and split it into batches.

        Parameters
        ----------
        n_samples_per_batch :
            Number of rows in each batch.
        n_features :
            Forwarded to ``tm.make_categorical``.
        n_batches :
            Number of batches to serve.
        n_cats :
            Forwarded to ``tm.make_categorical`` as ``n_categories``.
        sparsity, cat_ratio, onehot :
            Forwarded unchanged to ``tm.make_categorical``.
        device :
            When equal to ``"cuda"``, batches are converted to cudf/cupy
            objects in :meth:`next`.
        cache :
            Forwarded to ``DataIter`` as ``cache_prefix``.
        """
        super().__init__(cache_prefix=cache)
        self.n_batches = n_batches
        self.device = device

        total_rows = n_samples_per_batch * n_batches
        frame, labels = tm.make_categorical(
            total_rows,
            n_features,
            n_categories=n_cats,
            onehot=onehot,
            cat_ratio=cat_ratio,
            sparsity=sparsity,
        )

        # Row range [begin, end) of every batch; the end is clamped so that a
        # slice never asks for more rows than remain after ``begin``.
        bounds = []
        for batch_idx in range(n_batches):
            begin = batch_idx * n_samples_per_batch
            end = begin + min(n_samples_per_batch, total_rows - begin)
            bounds.append((begin, end))

        # ``frame`` is sliced by position through ``.iloc``.
        self.xs = [frame.iloc[begin:end, :] for begin, end in bounds]
        self.ys = [labels[begin:end] for begin, end in bounds]
        self.x = frame
        self.y = labels
        self._it = 0

    def xy(self) -> tuple:
        """Return the unsplit features and labels as a pair."""
        return self.x, self.y

    def next(self, input_data: Callable) -> bool:
        """Pass the current batch to ``input_data``.

        Returns ``False`` without calling ``input_data`` once all
        ``n_batches`` batches have been served, ``True`` otherwise.
        """
        idx = self._it
        if idx == self.n_batches:
            return False

        batch_X, batch_y = self.xs[idx], self.ys[idx]
        if self.device == "cuda":
            # Imported lazily so that CPU-only runs do not need them.
            import cudf
            import cupy

            batch_X, batch_y = cudf.DataFrame(batch_X), cupy.array(batch_y)

        input_data(data=batch_X, label=batch_y)
        self._it += 1
        return True

    def reset(self) -> None:
        """Rewind to the first batch."""
        self._it = 0