# pylint: disable=too-many-arguments
"""Copyright 2019-2025, XGBoost contributors"""

import logging
from collections.abc import Sequence
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Tuple,
    TypeVar,
    Union,
    cast,
    overload,
)

import dask
import distributed
import numpy as np
import pandas as pd
from dask import dataframe as dd

from .. import collective as coll
from .._data_utils import Categories
from .._typing import FeatureNames, FeatureTypes
from ..compat import concat, import_cupy
from ..core import Booster, DataIter, DMatrix, QuantileDMatrix
from ..data import is_on_cuda
from ..sklearn import get_model_categories, pick_ref_categories
from ..training import _RefError

LOGGER = logging.getLogger("[xgboost.dask]")

_DataParts = List[Dict[str, Any]]

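
# Optional meta-information fields that may accompany each ``data`` partition.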
meta = [
    "label",
    "weight",
    "base_margin",
    "qid",
    "label_lower_bound",
    "label_upper_bound",
]


class DaskPartitionIter(DataIter):  # pylint: disable=R0902
    """A data iterator for the `DaskQuantileDMatrix`."""

    def __init__(
        self,
        data: List[Any],
        feature_names: Optional[FeatureNames] = None,
        feature_types: Optional[Union[FeatureTypes, Categories]] = None,
        feature_weights: Optional[Any] = None,
        **kwargs: Optional[List[Any]],
    ) -> None:
        types = (Sequence, type(None))
        # Samples
        self._data = data
        for k in meta:
            setattr(self, k, kwargs.get(k, None))
            assert isinstance(getattr(self, k), types)

        # Feature info
        self._feature_names = feature_names
        self._feature_types = feature_types
        self._feature_weights = feature_weights

        assert isinstance(self._data, Sequence)

        self._iter = 0  # set iterator to 0
        super().__init__(release_data=True)

    def _get(self, attr: str) -> Optional[Any]:
        if getattr(self, attr) is not None:
            return getattr(self, attr)[self._iter]
        return None

    def data(self) -> Any:
        """Utility function for obtaining the current batch of data."""
        return self._data[self._iter]

    def reset(self) -> None:
        """Reset the iterator."""
        self._iter = 0

    def next(self, input_data: Callable) -> bool:
        """Yield the next batch of data."""
        if self._iter == len(self._data):
            # Return False when there's no more batch.
            return False

        kwargs = {k: self._get(k) for k in meta}
        input_data(
            data=self.data(),
            group=None,
            feature_names=self._feature_names,
            feature_types=self._feature_types,
            feature_weights=self._feature_weights,
            **kwargs,
        )
        self._iter += 1
        return True
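
# A minimal usage sketch (illustrative only; ``X0``, ``X1``, ``y0``, ``y1``
# stand for worker-local partitions):
#
#   it = DaskPartitionIter(data=[X0, X1], label=[y0, y1])
#   Xy = QuantileDMatrix(it, max_bin=256)
#
# ``QuantileDMatrix`` drives the iterator itself, calling ``next`` once per
# batch and ``reset`` between passes, so partitions are consumed one at a
# time instead of being concatenated.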


@overload
def _add_column(df: dd.DataFrame, col: dd.Series) -> Tuple[dd.DataFrame, str]: ...


@overload
def _add_column(df: dd.DataFrame, col: None) -> Tuple[dd.DataFrame, None]: ...


def _add_column(
    df: dd.DataFrame, col: Optional[dd.Series]
) -> Tuple[dd.DataFrame, Optional[str]]:
    if col is None:
        return df, col

    trails = 0
    uid = f"{col.name}_{trails}"
    while uid in df.columns:
        trails += 1
        uid = f"{col.name}_{trails}"

    df = df.assign(**{uid: col})
    return df, uid
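
# For illustration: if ``col.name`` is "qid" and the frame already has a
# column named "qid_0", the loop tries "qid_1", "qid_2", ... until it finds
# an unused name, then returns the frame with the series attached under that
# unique name.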


def no_group_split(  # pylint: disable=too-many-positional-arguments
    device: str | None,
    df: dd.DataFrame,
    qid: dd.Series,
    y: dd.Series,
    sample_weight: Optional[dd.Series],
    base_margin: Optional[dd.Series],
) -> Tuple[
    dd.DataFrame, dd.Series, dd.Series, Optional[dd.Series], Optional[dd.Series]
]:
    """Prevent query groups from being scattered across different workers. Please see
    the tutorial in the documentation for the implications of not having partition
    boundaries based on query groups.

    """

    df, qid_uid = _add_column(df, qid)
    df, y_uid = _add_column(df, y)
    df, w_uid = _add_column(df, sample_weight)
    df, bm_uid = _add_column(df, base_margin)

    # `tasks` shuffle is required as of rapids 24.12
    shuffle = "p2p" if device is None or device == "cpu" else "tasks"
    with dask.config.set({"dataframe.shuffle.method": shuffle}):
        df = df.persist()
        # Encode the QID to make it dense.
        df[qid_uid] = df[qid_uid].astype("category").cat.as_known().cat.codes
        # The shuffle here is costly.
        df = df.sort_values(by=qid_uid)
        cnt = df.groupby(qid_uid)[qid_uid].count()
        div = cnt.index.compute().values.tolist()
        div = sorted(div)
        div = tuple(div + [div[-1] + 1])

        df = df.set_index(
            qid_uid,
            drop=False,
            divisions=div,
        ).persist()

    qid = df[qid_uid]
    y = df[y_uid]
    sample_weight, base_margin = (
        cast(dd.Series, df[uid]) if uid is not None else None
        for uid in (w_uid, bm_uid)
    )

    uids = [uid for uid in [qid_uid, y_uid, w_uid, bm_uid] if uid is not None]
    df = df.drop(uids, axis=1).persist()
    return df, qid, y, sample_weight, base_margin
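
# For illustration: with three query groups the dense codes are 0, 1 and 2,
# so ``div`` becomes (0, 1, 2, 3) and ``set_index`` yields three partitions,
# each holding exactly one complete query group, which is what keeps a group
# from straddling a partition boundary.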


def sort_data_by_qid(**kwargs: List[Any]) -> Dict[str, List[Any]]:
    """Sort worker-local data by query ID for learning to rank tasks."""
    data_parts = kwargs.get("data")
    assert data_parts is not None
    n_parts = len(data_parts)

    if is_on_cuda(data_parts[0]):
        from cudf import DataFrame
    else:
        from pandas import DataFrame

    def get_dict(i: int) -> Dict[str, list]:
        """Return a dictionary containing all the meta info and all partitions."""

        def _get(attr: Optional[List[Any]]) -> Optional[list]:
            if attr is not None:
                return attr[i]
            return None

        data_opt = {name: _get(kwargs.get(name, None)) for name in meta}
        # Filter out None values.
        data = {k: v for k, v in data_opt.items() if v is not None}
        return data

    def map_fn(i: int) -> pd.DataFrame:
        data = get_dict(i)
        return DataFrame(data)

    meta_parts = [map_fn(i) for i in range(n_parts)]
    dfq = concat(meta_parts)
    if dfq.qid.is_monotonic_increasing:
        return kwargs

    LOGGER.warning(
        "[r%d]: Sorting data with %d partitions for ranking. "
        "This is a costly operation and will increase the memory usage significantly. "
        "To avoid this warning, sort the data based on qid before passing it into "
        "XGBoost. Alternatively, you can set `allow_group_split` to False.",
        coll.get_rank(),
        n_parts,
    )
    # I tried to construct a new dask DF to perform the sort, but it's quite difficult
    # to get the partition alignment right. Along with the still maturing shuffle
    # implementation and GPU compatibility, a simple concat is used.
    #
    # In case it might become useful one day, I managed to get a CPU version working,
    # albeit quite slow (much slower than concatenated sort). The implementation
    # merges everything into a single Dask DF and runs `DF.sort_values`, then
    # retrieves the individual X, y, qid, ... from calculated partition values
    # `client.compute([p for p in df.partitions])`. It was to avoid creating
    # mismatched partitions.
    dfx = concat(data_parts)

    if is_on_cuda(dfq):
        cp = import_cupy()
        sorted_idx = cp.argsort(dfq.qid)
    else:
        sorted_idx = np.argsort(dfq.qid)
    dfq = dfq.iloc[sorted_idx, :]

    if hasattr(dfx, "iloc"):
        dfx = dfx.iloc[sorted_idx, :]
    else:
        dfx = dfx[sorted_idx, :]

    kwargs.update({"data": [dfx]})
    for c in dfq.columns:
        assert c in kwargs
        kwargs.update({c: [dfq[c]]})

    return kwargs
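
# For illustration: given worker-local parts ``data=[X0, X1]`` and
# ``qid=[q0, q1]`` whose concatenation is not sorted, the result is
# ``{"data": [X_sorted], "qid": [qid_sorted], ...}``: every field collapses
# to a single combined partition, reordered by query ID.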


def _get_worker_parts(list_of_parts: _DataParts) -> Dict[str, List[Any]]:
    """Convert a list of dictionaries into a dictionary of lists."""
    assert isinstance(list_of_parts, list)
    result: Dict[str, List[Any]] = {}

    def append(i: int, name: str) -> None:
        part = list_of_parts[i].get(name)
        if part is not None:
            if name not in result:
                result[name] = []
            result[name].append(part)

    for i, _ in enumerate(list_of_parts):
        append(i, "data")
        for k in meta:
            append(i, k)

    qid = result.get("qid", None)
    if qid is not None:
        result = sort_data_by_qid(**result)
    return result
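
# For illustration, the conversion performed above is:
#   [{"data": X0, "label": y0}, {"data": X1, "label": y1}]
#     -> {"data": [X0, X1], "label": [y0, y1]}
# Fields that are absent (or None) in every part are omitted from the result.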


def _extract_data(
    parts: _DataParts,
    model: Optional[Booster],
    feature_types: Optional[FeatureTypes],
    xy_cats: Optional[Categories],
) -> Tuple[Dict[str, List[Any]], Optional[Union[FeatureTypes, Categories]]]:
    unzipped_dict = _get_worker_parts(parts)
    X = unzipped_dict["data"][0]
    _, model_cats = get_model_categories(X, model, feature_types)
    model_cats = pick_ref_categories(X, model_cats, xy_cats)
    return unzipped_dict, model_cats


def _get_is_cuda(parts: Optional[_DataParts]) -> bool:
    if parts is not None:
        is_cuda = is_on_cuda(parts[0].get("data"))
    else:
        is_cuda = False

    is_cuda = bool(coll.allreduce(np.array([is_cuda], dtype=np.int32), coll.Op.MAX)[0])
    return is_cuda
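
# The MAX allreduce means that if any worker holds GPU data, every worker
# reports True, so workers with no local partitions still create their empty
# matrix on the same device as the rest of the cluster.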


def _make_empty(is_cuda: bool) -> np.ndarray:
    if is_cuda:
        cp = import_cupy()
        empty = cp.empty((0, 0))
    else:
        empty = np.empty((0, 0))
    return empty


def _warn_empty() -> None:
    worker = distributed.get_worker()
    LOGGER.warning("Worker %s has an empty DMatrix.", worker.address)


def _create_quantile_dmatrix(
    *,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    feature_weights: Optional[Any],
    missing: float,
    nthread: int,
    parts: Optional[_DataParts],
    max_bin: int,
    enable_categorical: bool,
    max_quantile_batches: Optional[int],
    ref: Optional[DMatrix] = None,
    model: Optional[Booster],
    Xy_cats: Optional[Categories],
) -> QuantileDMatrix:
    is_cuda = _get_is_cuda(parts)
    if parts is None:
        _warn_empty()
        return QuantileDMatrix(
            _make_empty(is_cuda),
            feature_names=feature_names,
            feature_types=feature_types,
            max_bin=max_bin,
            ref=ref,
            enable_categorical=enable_categorical,
            max_quantile_batches=max_quantile_batches,
        )

    unzipped_dict, model_cats = _extract_data(parts, model, feature_types, Xy_cats)

    return QuantileDMatrix(
        DaskPartitionIter(
            **unzipped_dict,
            feature_types=model_cats,
            feature_names=feature_names,
            feature_weights=feature_weights,
        ),
        missing=missing,
        nthread=nthread,
        max_bin=max_bin,
        ref=ref,
        enable_categorical=enable_categorical,
        max_quantile_batches=max_quantile_batches,
    )
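
# A minimal sketch of how this is invoked on a worker (``local_parts`` is an
# illustrative placeholder for the partitions gathered on this worker, and
# the parameter values are example choices):
#
#   Xy = _create_quantile_dmatrix(
#       feature_names=None, feature_types=None, feature_weights=None,
#       missing=float("nan"), nthread=4, parts=local_parts, max_bin=256,
#       enable_categorical=False, max_quantile_batches=None,
#       model=None, Xy_cats=None,
#   )
#
# Because the partitions are streamed through ``DaskPartitionIter``, only the
# quantile sketch is materialized rather than a concatenated copy of the data.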


def _create_dmatrix(  # pylint: disable=too-many-locals
    *,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    feature_weights: Optional[Any],
    missing: float,
    nthread: int,
    enable_categorical: bool,
    parts: Optional[_DataParts],
    model: Optional[Booster],
    Xy_cats: Optional[Categories],
) -> DMatrix:
    """Get the data that is local to the worker from the DaskDMatrix.

    Returns
    -------
    A DMatrix object.

    """
    is_cuda = _get_is_cuda(parts)
    if parts is None:
        _warn_empty()
        return DMatrix(
            _make_empty(is_cuda),
            feature_names=feature_names,
            feature_types=feature_types,
            enable_categorical=enable_categorical,
        )

    T = TypeVar("T")

    def concat_or_none(data: Sequence[Optional[T]]) -> Optional[T]:
        if any(part is None for part in data):
            return None
        return concat(data)

    unzipped_dict, model_cats = _extract_data(parts, model, feature_types, Xy_cats)

    concated_dict: Dict[str, Any] = {}
    for key, value in unzipped_dict.items():
        v = concat_or_none(value)
        concated_dict[key] = v

    return DMatrix(
        **concated_dict,
        missing=missing,
        feature_names=feature_names,
        feature_types=model_cats,
        nthread=nthread,
        enable_categorical=enable_categorical,
        feature_weights=feature_weights,
    )
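
# Unlike the quantile path above, a plain ``DMatrix`` needs the worker-local
# partitions concatenated into a single in-memory copy before construction,
# which is why the streaming ``QuantileDMatrix`` path is generally cheaper.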


def _dmatrix_from_list_of_parts(is_quantile: bool, **kwargs: Any) -> DMatrix:
    if is_quantile:
        return _create_quantile_dmatrix(**kwargs)
    return _create_dmatrix(**kwargs)


def _get_dmatrices(
    train_ref: dict,
    train_id: int,
    *refs: dict,
    evals_id: Sequence[int],
    evals_name: Sequence[str],
    n_threads: int,
    model: Optional[Booster],
) -> Tuple[DMatrix, List[Tuple[DMatrix, str]]]:
    # Create the training DMatrix.
    Xy = _dmatrix_from_list_of_parts(
        **train_ref, nthread=n_threads, model=model, Xy_cats=None
    )

    # Create the evaluation DMatrices.
    evals: List[Tuple[DMatrix, str]] = []
    Xy_cats = Xy.get_categories()

    for i, ref in enumerate(refs):
        # Same DMatrix as the training one.
        if evals_id[i] == train_id:
            evals.append((Xy, evals_name[i]))
            continue
        # Check whether the training DMatrix has been used as a reference.
        if ref.get("ref", None) is not None:
            if ref["ref"] != train_id:
                raise ValueError(_RefError)
            del ref["ref"]  # Avoid a duplicated parameter in the next call.
            eval_xy = _dmatrix_from_list_of_parts(
                **ref, nthread=n_threads, ref=Xy, Xy_cats=Xy_cats, model=model
            )
        else:
            eval_xy = _dmatrix_from_list_of_parts(
                **ref, nthread=n_threads, Xy_cats=Xy_cats, model=model
            )
        evals.append((eval_xy, evals_name[i]))
    return Xy, evals
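
# For illustration: with ``evals_id = [train_id, some_other_id]`` the first
# evaluation entry reuses the training DMatrix directly, while the second is
# built from its own parts, using the training matrix as the quantile
# reference when its ``ref`` field points at ``train_id``.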