Files
MLPproject/.venv/lib/python3.12/site-packages/xgboost/testing/dask.py
2025-10-23 15:44:32 +02:00

399 lines
12 KiB
Python

# pylint: disable=invalid-name
"""Tests for dask shared by different test modules."""
from typing import Any, List, Literal, Tuple, Type, cast
import numpy as np
import pandas as pd
from dask import array as da
from dask import dataframe as dd
from distributed import Client, get_worker
from packaging.version import parse as parse_version
from sklearn.datasets import make_classification
import xgboost as xgb
import xgboost.testing as tm
from xgboost.compat import concat
from xgboost.testing.updater import get_basescore
from .. import dask as dxgb
from .._typing import EvalsLog
from ..dask import _get_rabit_args
from ..dask.utils import _DASK_VERSION
from .data import make_batches
from .data import make_categorical as make_cat_local
from .ordinal import make_recoded
from .utils import Device, assert_allclose
def check_init_estimation_clf(
    tree_method: str, device: Literal["cpu", "cuda"], client: Client
) -> None:
    """Test init estimation for classifier.

    Fits a single-node ``XGBClassifier`` and a ``DaskXGBClassifier`` with the
    same hyper-parameters on the same synthetic data, then asserts that the
    base scores returned by ``get_basescore`` are numerically close.

    Parameters
    ----------
    tree_method :
        Forwarded to both estimators as ``tree_method``.
    device :
        Forwarded to both estimators as ``device``.
    client :
        Dask client assigned to the distributed estimator.
    """
    params: dict = {
        "n_estimators": 1,
        "max_depth": 1,
        "tree_method": tree_method,
        "device": device,
    }
    X, y = make_classification(n_samples=4096 * 2, n_features=32, random_state=1994)

    # Reference value from the single-node estimator.
    local_clf = xgb.XGBClassifier(**params)
    local_clf.fit(X, y)
    expected = get_basescore(local_clf)

    # Same data, re-chunked into blocks of 32 rows for the dask estimator.
    dask_X = da.from_array(X).rechunk(chunks=(32, None))
    dask_y = da.from_array(y).rechunk(chunks=(32,))
    dask_clf = dxgb.DaskXGBClassifier(**params)
    dask_clf.client = client
    dask_clf.fit(dask_X, dask_y)
    actual = get_basescore(dask_clf)

    np.testing.assert_allclose(expected, actual)
def check_init_estimation_reg(
    tree_method: str, device: Literal["cpu", "cuda"], client: Client
) -> None:
    """Test init estimation for regressor.

    Fits a single-node ``XGBRegressor`` and a ``DaskXGBRegressor`` with the
    same hyper-parameters on the same synthetic data, then asserts that the
    base scores returned by ``get_basescore`` are numerically close.

    Parameters
    ----------
    tree_method :
        Forwarded to both estimators as ``tree_method``.
    device :
        Forwarded to both estimators as ``device``.
    client :
        Dask client assigned to the distributed estimator.
    """
    from sklearn.datasets import make_regression

    params: dict = {
        "n_estimators": 1,
        "max_depth": 1,
        "tree_method": tree_method,
        "device": device,
    }
    # pylint: disable=unbalanced-tuple-unpacking
    X, y = make_regression(n_samples=4096 * 2, n_features=32, random_state=1994)

    # Reference value from the single-node estimator.
    local_reg = xgb.XGBRegressor(**params)
    local_reg.fit(X, y)
    expected = get_basescore(local_reg)

    # Same data, re-chunked into blocks of 32 rows for the dask estimator.
    dask_X = da.from_array(X).rechunk(chunks=(32, None))
    dask_y = da.from_array(y).rechunk(chunks=(32,))
    dask_reg = dxgb.DaskXGBRegressor(**params)
    dask_reg.client = client
    dask_reg.fit(dask_X, dask_y)
    actual = get_basescore(dask_reg)

    np.testing.assert_allclose(expected, actual)
def check_init_estimation(
    tree_method: str, device: Literal["cpu", "cuda"], client: Client
) -> None:
    """Test init estimation.

    Runs the regressor check first, then the classifier check, passing the
    same ``tree_method``, ``device`` and ``client`` to each.
    """
    for check in (check_init_estimation_reg, check_init_estimation_clf):
        check(tree_method, device, client)
def check_uneven_nan(
    client: Client, tree_method: str, device: Literal["cpu", "cuda"], n_workers: int
) -> None:
    """Issue #9271, not every worker has missing value.

    Builds a 10000-row frame in which only rows 0, 1000, 2000 and 3000 of
    column ``a`` are NaN, splits it into ``n_workers`` partitions and fits a
    ``DaskXGBClassifier`` on it.  The check passes if fitting completes.

    Parameters
    ----------
    client :
        Dask client; made current for the duration of the fit.
    tree_method :
        Forwarded to the classifier as ``tree_method``.
    device :
        Forwarded to the classifier as ``device``.
    n_workers :
        Number of workers to wait for and number of partitions to create.
        Must be at least 2.
    """
    assert n_workers >= 2

    with client.as_current():
        clf = dxgb.DaskXGBClassifier(tree_method=tree_method, device=device)
        # Column "a" is created as float64 up front.  Writing NaN into an
        # int64 column relies on implicit upcasting, which pandas deprecated
        # (PDEP-6).  The resulting frame is the same as before.
        X = pd.DataFrame(
            {
                "a": np.arange(10000, dtype=np.float64),
                "b": range(10000, 0, -1),
            }
        )
        y = pd.Series([*[0] * 5000, *[1] * 5000])

        # ``.loc`` slicing is label based and end-inclusive: rows 0, 1000,
        # 2000 and 3000 become NaN; every later row keeps its value.
        X.loc[:3000:1000, "a"] = np.nan

        client.wait_for_workers(n_workers=n_workers)

        clf.fit(
            dd.from_pandas(X, npartitions=n_workers),
            dd.from_pandas(y, npartitions=n_workers),
        )
def check_external_memory(  # pylint: disable=too-many-locals
    worker_id: int,
    n_workers: int,
    device: str,
    comm_args: dict,
    is_qdm: bool,
) -> None:
    """Basic checks for distributed external memory.

    Trains on this worker's batches through an external-memory iterator
    inside a rabit communicator context, then trains a second model on the
    concatenation of the batches generated for every worker and asserts that
    the two training-RMSE histories agree.

    Parameters
    ----------
    worker_id :
        Used as ``random_state`` when generating this worker's batches.
    n_workers :
        Number of workers; batches for ``random_state`` 0..n_workers-1 are
        regenerated for the comparison model.
    device :
        Passed to XGBoost as ``device``; anything other than ``"cpu"`` makes
        ``make_batches`` use cupy.
    comm_args :
        Keyword arguments forwarded to ``CommunicatorContext``.
    is_qdm :
        Use ``ExtMemQuantileDMatrix`` / ``QuantileDMatrix`` instead of
        ``DMatrix``.
    """
    n_samples_per_batch = 32
    n_features = 4
    n_batches = 16
    use_cupy = device != "cpu"

    # Use as many threads as the dask worker running this function has.
    n_threads = get_worker().state.nthreads
    with xgb.collective.CommunicatorContext(dmlc_communicator="rabit", **comm_args):
        # This worker's share of the data, seeded by its own id.
        it = tm.IteratorForTest(
            *make_batches(
                n_samples_per_batch,
                n_features,
                n_batches,
                use_cupy=use_cupy,
                random_state=worker_id,
            ),
            cache="cache",
        )
        if is_qdm:
            Xy: xgb.DMatrix = xgb.ExtMemQuantileDMatrix(it, nthread=n_threads)
        else:
            Xy = xgb.DMatrix(it, nthread=n_threads)
        results: EvalsLog = {}
        xgb.train(
            {"tree_method": "hist", "nthread": n_threads, "device": device},
            Xy,
            evals=[(Xy, "Train")],
            num_boost_round=32,
            evals_result=results,
        )
        assert tm.non_increasing(cast(List[float], results["Train"]["rmse"]))

    # NOTE(review): indentation was lost in the reviewed copy of this file;
    # the comparison below is placed outside the communicator context on the
    # assumption that it is meant to be a non-distributed baseline - confirm.
    # Regenerate the batches for every worker (seeds 0..n_workers-1).
    lx, ly, lw = [], [], []
    for i in range(n_workers):
        x, y, w = make_batches(
            n_samples_per_batch,
            n_features,
            n_batches,
            use_cupy=use_cupy,
            random_state=i,
        )
        lx.extend(x)
        ly.extend(y)
        lw.extend(w)

    X = concat(lx)
    yconcat = concat(ly)
    wconcat = concat(lw)
    if is_qdm:
        Xy = xgb.QuantileDMatrix(X, yconcat, weight=wconcat, nthread=n_threads)
    else:
        Xy = xgb.DMatrix(X, yconcat, weight=wconcat, nthread=n_threads)

    results_local: EvalsLog = {}
    xgb.train(
        {"tree_method": "hist", "nthread": n_threads, "device": device},
        Xy,
        evals=[(Xy, "Train")],
        num_boost_round=32,
        evals_result=results_local,
    )
    # Both RMSE histories must match within a relative tolerance of 1e-4.
    np.testing.assert_allclose(
        results["Train"]["rmse"], results_local["Train"]["rmse"], rtol=1e-4
    )
def get_rabit_args(client: Client, n_workers: int) -> Any:
    """Get RABIT collective communicator arguments for tests.

    Runs ``_get_rabit_args(client, n_workers)`` through ``client.sync`` and
    returns its result unchanged.
    """
    rabit_args = client.sync(_get_rabit_args, client, n_workers)
    return rabit_args
def get_client_workers(client: Client) -> List[str]:
    """Get workers from a dask client.

    Returns the keys of the ``"workers"`` mapping in the client's scheduler
    info as a list.
    """
    # From dask 2025.4.0 on, ``n_workers=-1`` is passed explicitly.
    # NOTE(review): presumably -1 asks for all workers instead of a truncated
    # list - confirm against the distributed documentation.
    if _DASK_VERSION() >= parse_version("2025.4.0"):
        info = client.scheduler_info(n_workers=-1)
    else:
        info = client.scheduler_info()
    return list(info["workers"])
def make_ltr(  # pylint: disable=too-many-locals,too-many-arguments
    client: Client,
    n_samples: int,
    n_features: int,
    *,
    n_query_groups: int,
    max_rel: int,
    device: str,
) -> Tuple[dd.DataFrame, dd.Series, dd.Series]:
    """Synthetic dataset for learning to rank.

    One partition of ``n_samples // n_workers`` rows is generated on each
    worker; if ``n_samples`` is not a multiple of the worker count, one extra
    partition holds the remaining rows, so the result has exactly
    ``n_samples`` rows.

    Parameters
    ----------
    client :
        Dask client used to submit the generation tasks.
    n_samples :
        Total number of rows to generate. Must be positive.
    n_features :
        Number of feature columns (named ``f0`` .. ``f{n_features - 1}``).
    n_query_groups :
        Query ids are drawn uniformly from ``[0, n_query_groups)``.
    max_rel :
        Number of classes passed to ``make_classification``; the class index
        is used as the relevance label.
    device :
        ``"cpu"`` builds pandas frames; any other value builds cudf frames.

    Returns
    -------
    A tuple ``(X, y, qid)``: features, relevance labels and query ids, in
    that order.

    Raises
    ------
    ValueError
        If ``n_samples`` is not positive.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be positive, got {n_samples}.")

    workers = get_client_workers(client)
    n_workers = len(workers)
    n_samples_per_worker, last = divmod(n_samples, n_workers)

    if device == "cpu":
        from pandas import DataFrame as DF
    else:
        from cudf import DataFrame as DF

    def make(n: int, seed: int) -> pd.DataFrame:
        # ``seed`` only drives the query-id sampling below.
        # NOTE(review): ``make_classification`` is called without a
        # ``random_state``, so features and labels are not reproducible.
        rng = np.random.default_rng(seed)
        X, y = make_classification(
            n, n_features, n_informative=n_features, n_redundant=0, n_classes=max_rel
        )
        qid = rng.integers(size=(n,), low=0, high=n_query_groups)
        df = DF(X, columns=[f"f{i}" for i in range(n_features)])
        df["qid"] = qid
        df["y"] = y
        return df

    futures = []
    # Exactly one full-size partition per worker.  Iterating over
    # ``range(0, n_samples, n_samples_per_worker)`` instead would run one
    # extra time when there is a remainder (over-generating rows) and would
    # raise on a zero step when there are fewer samples than workers.
    if n_samples_per_worker > 0:
        for i, worker in enumerate(workers):
            fut = client.submit(
                make,
                n=n_samples_per_worker,
                seed=i * n_samples_per_worker,
                workers=[worker],
            )
            futures.append(fut)

    # Left-over rows go into one additional partition that is not pinned to
    # a particular worker.
    if last != 0:
        fut = client.submit(make, n=last, seed=n_samples_per_worker * n_workers)
        futures.append(fut)

    # A one-row frame built locally describes the column layout to dask.
    meta = make(1, 0)
    df = dd.from_delayed(futures, meta=meta)
    assert isinstance(df, dd.DataFrame)
    return df.drop(["qid", "y"], axis=1), df.y, df.qid
def check_no_group_split(client: Client, device: str) -> None:
    """Test for the allow_group_split parameter.

    Fits a ``DaskXGBRanker`` with ``allow_group_split=False`` on synthetic
    learning-to-rank data and asserts that the feature count is unchanged
    and that the ``ndcg@32`` history of the first evaluation set rises to
    about 1.0.

    Parameters
    ----------
    client :
        Dask client used to generate the data.
    device :
        Forwarded to the data generator and to the ranker.
    """
    # ``make_ltr`` returns ``(X, y, qid)``.  Unpack in that order so the
    # relevance labels and the query ids are not swapped when passed to
    # ``fit`` below.
    X_tr, y_tr, q_tr = make_ltr(
        client, 4096, 128, n_query_groups=4, max_rel=5, device=device
    )
    X_va, y_va, q_va = make_ltr(
        client, 1024, 128, n_query_groups=4, max_rel=5, device=device
    )

    ltr = dxgb.DaskXGBRanker(
        allow_group_split=False,
        n_estimators=36,
        device=device,
        objective="rank:pairwise",
    )
    ltr.fit(
        X_tr,
        y_tr,
        qid=q_tr,
        eval_set=[(X_tr, y_tr), (X_va, y_va)],
        eval_qid=[q_tr, q_va],
        verbose=True,
    )

    assert ltr.n_features_in_ == 128
    assert X_tr.shape[1] == ltr.n_features_in_  # no change
    # "validation_0" is the first eval set, i.e. the training data.
    ndcg = ltr.evals_result()["validation_0"]["ndcg@32"]
    assert tm.non_decreasing(ndcg[:16], tolerance=1e-2), ndcg
    np.testing.assert_allclose(ndcg[-1], 1.0, rtol=1e-2)
def make_categorical(  # pylint: disable=too-many-locals, too-many-arguments
    client: Client,
    n_samples: int,
    n_features: int,
    n_categories: int,
    *,
    onehot: bool = False,
    cat_dtype: np.typing.DTypeLike = np.int64,
) -> Tuple[dd.DataFrame, dd.Series]:
    """Synthesize categorical data with dask.

    One partition is generated on each worker.  The ``n_samples`` rows are
    spread as evenly as possible: the first ``n_samples % n_workers`` workers
    receive one row more than the others, so no requested row is dropped.
    When there are fewer samples than workers the trailing partitions are
    empty.

    Parameters
    ----------
    client :
        Dask client used to submit the generation tasks.
    n_samples :
        Total number of rows across all partitions.
    n_features :
        Forwarded to the local generator.
    n_categories :
        Forwarded to the local generator.
    onehot :
        If true, the returned features are passed through
        ``dd.get_dummies``.  The local generator is always called with
        ``onehot=False``.
    cat_dtype :
        Forwarded to the local generator.

    Returns
    -------
    A tuple ``(X, y)`` of the feature frame and the ``label`` column.
    """
    workers = get_client_workers(client)
    n_workers = len(workers)
    # Even split; ``extra`` workers take one additional row.  With
    # ``n_samples == 1`` this gives the first worker the single row and
    # leaves the rest empty, which is what the empty-DMatrix tests need.
    base, extra = divmod(n_samples, n_workers)

    dfs = []

    def pack(**kwargs: Any) -> pd.DataFrame:
        # Attach the label as a column so features and labels travel in one
        # partition.  NOTE(review): the annotation assumes ``make_cat_local``
        # returns a pandas-like frame - confirm.
        X, y = make_cat_local(**kwargs)
        X["label"] = y
        return X

    # A one-row frame built locally describes the column layout to dask.
    meta = pack(
        n_samples=1,
        n_features=n_features,
        n_categories=n_categories,
        onehot=False,
        cat_dtype=cat_dtype,
    )

    for i, worker in enumerate(workers):
        l_n_samples = base + (1 if i < extra else 0)
        future = client.submit(
            pack,
            n_samples=l_n_samples,
            n_features=n_features,
            n_categories=n_categories,
            cat_dtype=cat_dtype,
            onehot=False,
            workers=[worker],
        )
        dfs.append(future)

    df: dd.DataFrame = cast(dd.DataFrame, dd.from_delayed(dfs, meta=meta))
    y = df["label"]
    X = df[df.columns.difference(["label"])]

    if onehot:
        return dd.get_dummies(X), y
    return X, y
# pylint: disable=too-many-locals
def run_recode(client: Client, device: Device) -> None:
    """Run re-coding test with the Dask interface.

    For both ``DaskDMatrix`` and ``DaskQuantileDMatrix`` this trains a base
    model, continues training from it twice (once with the encoded frame as
    training data and the re-encoded frame as validation data, once the
    other way round) and asserts that the two validation RMSE histories and
    the predictions on both frames agree.
    """

    def create_dmatrix(
        DMatrixT: Type[dxgb.DaskDMatrix], *args: Any, **kwargs: Any
    ) -> dxgb.DaskDMatrix:
        # ``ref`` is forwarded to the quantile DMatrix only; for any other
        # type it is removed from the keyword arguments.
        ref = kwargs.pop("ref", None)
        if DMatrixT is dxgb.DaskQuantileDMatrix:
            kwargs["ref"] = ref
        return DMatrixT(*args, **kwargs)

    def run(DMatrixT: Type[dxgb.DaskDMatrix]) -> None:
        enc, reenc, y, _, _ = make_recoded(device, n_features=96)
        to = get_client_workers(client)

        denc = dd.from_pandas(enc, npartitions=8).persist(workers=to)
        dreenc = dd.from_pandas(reenc, npartitions=8).persist(workers=to)
        dy = da.from_array(y, chunks=(y.shape[0] // 8,)).persist(workers=to)

        def make_pair(
            train_X: Any, valid_X: Any
        ) -> Tuple[dxgb.DaskDMatrix, dxgb.DaskDMatrix]:
            # The training matrix is built first and then used as ``ref``
            # for the validation matrix.
            train = create_dmatrix(
                DMatrixT, client, train_X, dy, enable_categorical=True
            )
            valid = create_dmatrix(
                DMatrixT, client, valid_X, dy, enable_categorical=True, ref=train
            )
            return train, valid

        # Base model
        Xy, Xy_valid = make_pair(denc, dreenc)
        results = dxgb.train(
            client, {"device": device}, Xy, evals=[(Xy_valid, "Valid")]
        )
        base_booster = results["booster"]

        # Training continuation
        Xy, Xy_valid = make_pair(denc, dreenc)
        results_1 = dxgb.train(
            client,
            {"device": device},
            Xy,
            evals=[(Xy_valid, "Valid")],
            xgb_model=base_booster,
        )

        # Reversed training continuation
        Xy, Xy_valid = make_pair(dreenc, denc)
        results_2 = dxgb.train(
            client,
            {"device": device},
            Xy,
            evals=[(Xy_valid, "Valid")],
            xgb_model=base_booster,
        )

        rmse_1 = results_1["history"]["Valid"]["rmse"]
        rmse_2 = results_2["history"]["Valid"]["rmse"]
        np.testing.assert_allclose(rmse_1, rmse_2)

        # In-place prediction on the raw frames.
        predt_0 = dxgb.inplace_predict(client, results, denc).compute()
        predt_1 = dxgb.inplace_predict(client, results, dreenc).compute()
        assert_allclose(device, predt_0, predt_1)

        # Prediction on the DMatrix pair from the reversed continuation.
        predt_0 = dxgb.predict(client, results, Xy).compute()
        predt_1 = dxgb.predict(client, results, Xy_valid).compute()
        assert_allclose(device, predt_0, predt_1)

    for matrix_type in (dxgb.DaskDMatrix, dxgb.DaskQuantileDMatrix):
        run(matrix_type)