175 lines
4.8 KiB
Python
175 lines
4.8 KiB
Python
"""Tests related to the `DataIter` interface."""
|
|
|
|
from typing import Callable, Optional
|
|
|
|
import numpy as np
|
|
|
|
from xgboost import testing as tm
|
|
|
|
from ..compat import import_cupy
|
|
from ..core import DataIter, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
|
|
from .utils import predictor_equal
|
|
|
|
|
|
def run_mixed_sparsity(device: str) -> None:
    """Check QDM with mixed batches.

    A :class:`QuantileDMatrix` is built from an iterator over three batches
    that come from different generators (one ``make_regression`` batch and two
    ``make_sparse_regression`` batches).  A second one is built from the
    concatenation of the same three batches, and the two are compared with
    :func:`predictor_equal`.

    Parameters
    ----------
    device :
        When the string starts with ``"cuda"`` the batches are converted to
        cupy arrays before being handed to the iterator.

    """
    on_cuda = device.startswith("cuda")

    first_x, first_y, _ = tm.make_regression(128, 16, False)
    # The last argument of the second batch follows the device: True on CUDA,
    # False otherwise.  NOTE(review): presumably this flag selects a dense
    # output -- confirm against `tm.make_sparse_regression`.
    second_x, second_y = tm.make_sparse_regression(256, 16, 0.1, on_cuda)
    third_x, third_y = tm.make_sparse_regression(512, 16, 0.9, True)

    batches = [first_x, second_x, third_x]
    labels = [first_y, second_y, third_y]
    if on_cuda:
        cp = import_cupy()
        batches = [cp.array(batch) for batch in batches]

    batch_iter = tm.IteratorForTest(
        batches, labels, None, cache=None, on_host=False
    )
    from_iterator = QuantileDMatrix(batch_iter)

    # Regenerate the second batch with the flag forced to True so that it can
    # be stacked with the others through numpy.  NOTE(review): this relies on
    # `make_sparse_regression` producing the same values on every call --
    # confirm.
    second_x, second_y = tm.make_sparse_regression(256, 16, 0.1, True)
    stacked_x = np.concatenate([first_x, second_x, third_x], axis=0)
    stacked_y = np.concatenate([first_y, second_y, third_y], axis=0)
    from_array = QuantileDMatrix(stacked_x, stacked_y)

    assert predictor_equal(from_iterator, from_array)
|
|
|
|
|
|
def check_invalid_cat_batches(device: str) -> None:
    """Check error message for inconsistent feature types.

    A two-batch iterator is defined whose first batch is generated with
    ``cat_ratio=1.0`` and whose second batch is generated with
    ``cat_ratio=0.5``.  Constructing any of the three DMatrix flavours from it
    is expected to raise :class:`ValueError` with a matching message.

    Parameters
    ----------
    device :
        When equal to ``"cuda"`` each batch is converted to a cudf data frame
        (features) and a cupy array (labels) before being passed on.

    """

    class _InvalidCatIter(DataIter):
        """Yield two categorical batches generated with different ratios."""

        def __init__(self) -> None:
            super().__init__(cache_prefix=None)
            self._it = 0

        def next(self, input_data: Callable) -> bool:
            # Stop after two batches.
            if self._it == 2:
                return False

            # Only the very first batch is generated with a ratio of 1.0.
            ratio = 1.0 if self._it == 0 else 0.5
            data, label = tm.make_categorical(
                64, 12, 4, onehot=False, sparsity=0.5, cat_ratio=ratio
            )
            if device == "cuda":
                import cudf
                import cupy

                data = cudf.DataFrame(data)
                label = cupy.array(label)

            input_data(data=data, label=label)
            self._it += 1
            return True

        def reset(self) -> None:
            self._it = 0

    it = _InvalidCatIter()
    import pytest

    # (constructor, expected error pattern), tried in this order with the same
    # iterator instance.
    expectations = (
        (ExtMemQuantileDMatrix, "Inconsistent number of categories between"),
        (QuantileDMatrix, "Inconsistent number of categories between"),
        (DMatrix, "Inconsistent feature types"),
    )
    for make_dmatrix, pattern in expectations:
        with pytest.raises(ValueError, match=pattern):
            make_dmatrix(it, enable_categorical=True)
|
|
|
|
|
|
def check_uneven_sizes(device: str) -> None:
    """Tests for having irregular data shapes.

    Three regression batches with 512, 256 and 1024 samples (16 features each)
    are served through one iterator.  Both :class:`DMatrix` and
    :class:`ExtMemQuantileDMatrix` built from it must report 16 columns and a
    row count equal to the sum of the batch sizes.

    Parameters
    ----------
    device :
        ``use_cupy`` is enabled for the data generator when this equals
        ``"cuda"``.

    """
    use_cupy = device == "cuda"
    batches = [
        tm.make_regression(n_samples, 16, use_cupy=use_cupy)
        for n_samples in [512, 256, 1024]
    ]
    # Transpose the list of per-batch tuples; only the first two fields are
    # handed to the iterator.
    features, labels, *_ = zip(*batches)
    it = tm.IteratorForTest(features, labels, None, cache="cache", on_host=True)

    expected_rows = sum(x.shape[0] for x in features)
    for make_dmatrix in (DMatrix, ExtMemQuantileDMatrix):
        Xy = make_dmatrix(it)
        assert Xy.num_col() == 16
        assert Xy.num_row() == expected_rows
|
|
|
|
|
|
class CatIter(DataIter):  # pylint: disable=too-many-instance-attributes
    """An iterator for testing categorical features.

    All samples are generated up front by a single ``tm.make_categorical``
    call and then cut into ``n_batches`` consecutive slices.  The full data
    stays available through :meth:`xy`.
    """

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        n_samples_per_batch: int,
        n_features: int,
        *,
        n_batches: int,
        n_cats: int,
        sparsity: float,
        cat_ratio: float,
        onehot: bool,
        device: str,
        cache: Optional[str],
    ) -> None:
        """Generate the data and split it into batches.

        Parameters
        ----------
        n_samples_per_batch :
            Number of rows in each batch.
        n_features :
            Forwarded to ``tm.make_categorical``.
        n_batches :
            Number of batches this iterator yields.
        n_cats :
            Forwarded to ``tm.make_categorical`` as ``n_categories``.
        sparsity, cat_ratio, onehot :
            Forwarded unchanged to ``tm.make_categorical``.
        device :
            When equal to ``"cuda"``, batches are converted to cudf/cupy
            objects in :meth:`next`.
        cache :
            Forwarded to the base class as ``cache_prefix``.

        """
        super().__init__(cache_prefix=cache)
        self.n_batches = n_batches
        self.device = device

        n_samples = n_samples_per_batch * n_batches
        cat, y = tm.make_categorical(
            n_samples,
            n_features,
            n_categories=n_cats,
            onehot=onehot,
            cat_ratio=cat_ratio,
            sparsity=sparsity,
        )

        # [begin, end) row bounds of each batch; the end is clamped so that a
        # slice never asks for more rows than remain.
        bounds = []
        for idx in range(n_batches):
            begin = idx * n_samples_per_batch
            end = begin + min(n_samples_per_batch, n_samples - begin)
            bounds.append((begin, end))

        self.xs = [cat.iloc[begin:end, :] for begin, end in bounds]
        self.ys = [y[begin:end] for begin, end in bounds]

        # Keep the un-split data for `xy`.
        self.x = cat
        self.y = y

        self._it = 0

    def xy(self) -> tuple:
        """Return the concatenated data."""
        return self.x, self.y

    def next(self, input_data: Callable) -> bool:
        """Feed the current batch to ``input_data``.

        Returns ``False`` without calling ``input_data`` once every batch has
        been consumed, otherwise ``True``.
        """
        if self._it == self.n_batches:
            return False

        data = self.xs[self._it]
        label = self.ys[self._it]
        if self.device == "cuda":
            import cudf
            import cupy

            data = cudf.DataFrame(data)
            label = cupy.array(label)
        input_data(data=data, label=label)
        self._it += 1
        return True

    def reset(self) -> None:
        """Rewind to the first batch."""
        self._it = 0
|