# pylint: disable=too-many-arguments
"""Copyright 2019-2025, XGBoost contributors"""

import logging
from collections.abc import Sequence
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Tuple,
    TypeVar,
    Union,
    cast,
    overload,
)

import dask
import distributed
import numpy as np
import pandas as pd
from dask import dataframe as dd

from .. import collective as coll
from .._data_utils import Categories
from .._typing import FeatureNames, FeatureTypes
from ..compat import concat, import_cupy
from ..core import Booster, DataIter, DMatrix, QuantileDMatrix
from ..data import is_on_cuda
from ..sklearn import get_model_categories, pick_ref_categories
from ..training import _RefError

LOGGER = logging.getLogger("[xgboost.dask]")

_DataParts = List[Dict[str, Any]]

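
# Optional meta-information fields that may accompany each ``data`` partition.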
meta = [
    "label",
    "weight",
    "base_margin",
    "qid",
    "label_lower_bound",
    "label_upper_bound",
]


class DaskPartitionIter(DataIter):  # pylint: disable=R0902
    """A data iterator for the `DaskQuantileDMatrix`."""

    def __init__(
        self,
        data: List[Any],
        feature_names: Optional[FeatureNames] = None,
        feature_types: Optional[Union[FeatureTypes, Categories]] = None,
        feature_weights: Optional[Any] = None,
        **kwargs: Optional[List[Any]],
    ) -> None:
        types = (Sequence, type(None))
        # Samples
        self._data = data
        for k in meta:
            setattr(self, k, kwargs.get(k, None))
            assert isinstance(getattr(self, k), types)

        # Feature info
        self._feature_names = feature_names
        self._feature_types = feature_types
        self._feature_weights = feature_weights

        assert isinstance(self._data, Sequence)

        self._iter = 0  # set iterator to 0
        super().__init__(release_data=True)

    def _get(self, attr: str) -> Optional[Any]:
        if getattr(self, attr) is not None:
            return getattr(self, attr)[self._iter]
        return None

    def data(self) -> Any:
        """Utility function for obtaining the current batch of data."""
        return self._data[self._iter]

    def reset(self) -> None:
        """Reset the iterator."""
        self._iter = 0

    def next(self, input_data: Callable) -> bool:
        """Yield the next batch of data."""
        if self._iter == len(self._data):
            # Return False when there's no more batch.
            return False

        kwargs = {k: self._get(k) for k in meta}
        input_data(
            data=self.data(),
            group=None,
            feature_names=self._feature_names,
            feature_types=self._feature_types,
            feature_weights=self._feature_weights,
            **kwargs,
        )
        self._iter += 1
        return True
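
# A minimal usage sketch (illustrative only; ``X0``, ``X1``, ``y0``, ``y1``
# stand for worker-local partitions):
#
#   it = DaskPartitionIter(data=[X0, X1], label=[y0, y1])
#   Xy = QuantileDMatrix(it, max_bin=256)
#
# ``QuantileDMatrix`` drives the iterator itself, calling ``next`` once per
# batch and ``reset`` between passes, so partitions are consumed one at a
# time instead of being concatenated.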


@overload
def _add_column(df: dd.DataFrame, col: dd.Series) -> Tuple[dd.DataFrame, str]: ...


@overload
def _add_column(df: dd.DataFrame, col: None) -> Tuple[dd.DataFrame, None]: ...


def _add_column(
    df: dd.DataFrame, col: Optional[dd.Series]
) -> Tuple[dd.DataFrame, Optional[str]]:
    if col is None:
        return df, col

    trails = 0
    uid = f"{col.name}_{trails}"
    while uid in df.columns:
        trails += 1
        uid = f"{col.name}_{trails}"

    df = df.assign(**{uid: col})
    return df, uid
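
# For illustration: if ``col.name`` is "qid" and the frame already has a
# column named "qid_0", the loop tries "qid_1", "qid_2", ... until it finds
# an unused name, then returns the frame with the series attached under that
# unique name.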


def no_group_split(  # pylint: disable=too-many-positional-arguments
    device: str | None,
    df: dd.DataFrame,
    qid: dd.Series,
    y: dd.Series,
    sample_weight: Optional[dd.Series],
    base_margin: Optional[dd.Series],
) -> Tuple[
    dd.DataFrame, dd.Series, dd.Series, Optional[dd.Series], Optional[dd.Series]
]:
    """Prevent query groups from being scattered across different workers. Please see
    the tutorial in the documentation for the implications of not having partition
    boundaries based on query groups.

    """

    df, qid_uid = _add_column(df, qid)
    df, y_uid = _add_column(df, y)
    df, w_uid = _add_column(df, sample_weight)
    df, bm_uid = _add_column(df, base_margin)

    # `tasks` shuffle is required as of rapids 24.12
    shuffle = "p2p" if device is None or device == "cpu" else "tasks"
    with dask.config.set({"dataframe.shuffle.method": shuffle}):
        df = df.persist()
        # Encode the QID to make it dense.
        df[qid_uid] = df[qid_uid].astype("category").cat.as_known().cat.codes
        # The shuffle here is costly.
        df = df.sort_values(by=qid_uid)
        cnt = df.groupby(qid_uid)[qid_uid].count()
        div = cnt.index.compute().values.tolist()
        div = sorted(div)
        div = tuple(div + [div[-1] + 1])

        df = df.set_index(
            qid_uid,
            drop=False,
            divisions=div,
        ).persist()

    qid = df[qid_uid]
    y = df[y_uid]
    sample_weight, base_margin = (
        cast(dd.Series, df[uid]) if uid is not None else None
        for uid in (w_uid, bm_uid)
    )

    uids = [uid for uid in [qid_uid, y_uid, w_uid, bm_uid] if uid is not None]
    df = df.drop(uids, axis=1).persist()
    return df, qid, y, sample_weight, base_margin
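
# For illustration: with three query groups the dense codes are 0, 1 and 2,
# so ``div`` becomes (0, 1, 2, 3) and ``set_index`` yields three partitions,
# each holding exactly one complete query group, which is what keeps a group
# from straddling a partition boundary.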


def sort_data_by_qid(**kwargs: List[Any]) -> Dict[str, List[Any]]:
    """Sort worker-local data by query ID for learning to rank tasks."""
    data_parts = kwargs.get("data")
    assert data_parts is not None
    n_parts = len(data_parts)

    if is_on_cuda(data_parts[0]):
        from cudf import DataFrame
    else:
        from pandas import DataFrame

    def get_dict(i: int) -> Dict[str, list]:
        """Return a dictionary containing all the meta info and all partitions."""

        def _get(attr: Optional[List[Any]]) -> Optional[list]:
            if attr is not None:
                return attr[i]
            return None

        data_opt = {name: _get(kwargs.get(name, None)) for name in meta}
        # Filter out None values.
        data = {k: v for k, v in data_opt.items() if v is not None}
        return data

    def map_fn(i: int) -> pd.DataFrame:
        data = get_dict(i)
        return DataFrame(data)

    meta_parts = [map_fn(i) for i in range(n_parts)]
    dfq = concat(meta_parts)
    if dfq.qid.is_monotonic_increasing:
        return kwargs

    LOGGER.warning(
        "[r%d]: Sorting data with %d partitions for ranking. "
        "This is a costly operation and will increase the memory usage significantly. "
        "To avoid this warning, sort the data based on qid before passing it into "
        "XGBoost. Alternatively, you can set `allow_group_split` to False.",
        coll.get_rank(),
        n_parts,
    )
    # I tried to construct a new dask DF to perform the sort, but it's quite difficult
    # to get the partition alignment right. Along with the still maturing shuffle
    # implementation and GPU compatibility, a simple concat is used.
    #
    # In case it might become useful one day, I managed to get a CPU version working,
    # albeit quite slow (much slower than concatenated sort). The implementation
    # merges everything into a single Dask DF and runs `DF.sort_values`, then
    # retrieves the individual X, y, qid, ... from calculated partition values
    # `client.compute([p for p in df.partitions])`. It was to avoid creating
    # mismatched partitions.
    dfx = concat(data_parts)

    if is_on_cuda(dfq):
        cp = import_cupy()
        sorted_idx = cp.argsort(dfq.qid)
    else:
        sorted_idx = np.argsort(dfq.qid)
    dfq = dfq.iloc[sorted_idx, :]

    if hasattr(dfx, "iloc"):
        dfx = dfx.iloc[sorted_idx, :]
    else:
        dfx = dfx[sorted_idx, :]

    kwargs.update({"data": [dfx]})
    for c in dfq.columns:
        assert c in kwargs
        kwargs.update({c: [dfq[c]]})

    return kwargs
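
# For illustration: given worker-local parts ``data=[X0, X1]`` and
# ``qid=[q0, q1]`` whose concatenation is not sorted, the result is
# ``{"data": [X_sorted], "qid": [qid_sorted], ...}``: every field collapses
# to a single combined partition, reordered by query ID.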


def _get_worker_parts(list_of_parts: _DataParts) -> Dict[str, List[Any]]:
    """Convert a list of dictionaries into a dictionary of lists."""
    assert isinstance(list_of_parts, list)
    result: Dict[str, List[Any]] = {}

    def append(i: int, name: str) -> None:
        part = list_of_parts[i].get(name)
        if part is not None:
            if name not in result:
                result[name] = []
            result[name].append(part)

    for i, _ in enumerate(list_of_parts):
        append(i, "data")
        for k in meta:
            append(i, k)

    qid = result.get("qid", None)
    if qid is not None:
        result = sort_data_by_qid(**result)
    return result
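
# For illustration, the conversion performed above is:
#   [{"data": X0, "label": y0}, {"data": X1, "label": y1}]
#     -> {"data": [X0, X1], "label": [y0, y1]}
# Fields that are absent (or None) in every part are omitted from the result.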


def _extract_data(
    parts: _DataParts,
    model: Optional[Booster],
    feature_types: Optional[FeatureTypes],
    xy_cats: Optional[Categories],
) -> Tuple[Dict[str, List[Any]], Optional[Union[FeatureTypes, Categories]]]:
    unzipped_dict = _get_worker_parts(parts)
    X = unzipped_dict["data"][0]
    _, model_cats = get_model_categories(X, model, feature_types)
    model_cats = pick_ref_categories(X, model_cats, xy_cats)
    return unzipped_dict, model_cats


def _get_is_cuda(parts: Optional[_DataParts]) -> bool:
    if parts is not None:
        is_cuda = is_on_cuda(parts[0].get("data"))
    else:
        is_cuda = False

    is_cuda = bool(coll.allreduce(np.array([is_cuda], dtype=np.int32), coll.Op.MAX)[0])
    return is_cuda
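
# The MAX allreduce means that if any worker holds GPU data, every worker
# reports True, so workers with no local partitions still create their empty
# matrix on the same device as the rest of the cluster.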


def _make_empty(is_cuda: bool) -> np.ndarray:
    if is_cuda:
        cp = import_cupy()
        empty = cp.empty((0, 0))
    else:
        empty = np.empty((0, 0))
    return empty


def _warn_empty() -> None:
    worker = distributed.get_worker()
    LOGGER.warning("Worker %s has an empty DMatrix.", worker.address)


def _create_quantile_dmatrix(
    *,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    feature_weights: Optional[Any],
    missing: float,
    nthread: int,
    parts: Optional[_DataParts],
    max_bin: int,
    enable_categorical: bool,
    max_quantile_batches: Optional[int],
    ref: Optional[DMatrix] = None,
    model: Optional[Booster],
    Xy_cats: Optional[Categories],
) -> QuantileDMatrix:
    is_cuda = _get_is_cuda(parts)
    if parts is None:
        _warn_empty()
        return QuantileDMatrix(
            _make_empty(is_cuda),
            feature_names=feature_names,
            feature_types=feature_types,
            max_bin=max_bin,
            ref=ref,
            enable_categorical=enable_categorical,
            max_quantile_batches=max_quantile_batches,
        )

    unzipped_dict, model_cats = _extract_data(parts, model, feature_types, Xy_cats)

    return QuantileDMatrix(
        DaskPartitionIter(
            **unzipped_dict,
            feature_types=model_cats,
            feature_names=feature_names,
            feature_weights=feature_weights,
        ),
        missing=missing,
        nthread=nthread,
        max_bin=max_bin,
        ref=ref,
        enable_categorical=enable_categorical,
        max_quantile_batches=max_quantile_batches,
    )
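
# A minimal sketch of how this is invoked on a worker (``local_parts`` is an
# illustrative placeholder for the partitions gathered on this worker, and
# the parameter values are example choices):
#
#   Xy = _create_quantile_dmatrix(
#       feature_names=None, feature_types=None, feature_weights=None,
#       missing=float("nan"), nthread=4, parts=local_parts, max_bin=256,
#       enable_categorical=False, max_quantile_batches=None,
#       model=None, Xy_cats=None,
#   )
#
# Because the partitions are streamed through ``DaskPartitionIter``, only the
# quantile sketch is materialized rather than a concatenated copy of the data.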


def _create_dmatrix(  # pylint: disable=too-many-locals
    *,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    feature_weights: Optional[Any],
    missing: float,
    nthread: int,
    enable_categorical: bool,
    parts: Optional[_DataParts],
    model: Optional[Booster],
    Xy_cats: Optional[Categories],
) -> DMatrix:
    """Get the data that is local to the worker from the DaskDMatrix.

    Returns
    -------
    A DMatrix object.

    """
    is_cuda = _get_is_cuda(parts)
    if parts is None:
        _warn_empty()
        return DMatrix(
            _make_empty(is_cuda),
            feature_names=feature_names,
            feature_types=feature_types,
            enable_categorical=enable_categorical,
        )

    T = TypeVar("T")

    def concat_or_none(data: Sequence[Optional[T]]) -> Optional[T]:
        if any(part is None for part in data):
            return None
        return concat(data)

    unzipped_dict, model_cats = _extract_data(parts, model, feature_types, Xy_cats)

    concated_dict: Dict[str, Any] = {}
    for key, value in unzipped_dict.items():
        v = concat_or_none(value)
        concated_dict[key] = v

    return DMatrix(
        **concated_dict,
        missing=missing,
        feature_names=feature_names,
        feature_types=model_cats,
        nthread=nthread,
        enable_categorical=enable_categorical,
        feature_weights=feature_weights,
    )
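
# Unlike the quantile path above, a plain ``DMatrix`` needs the worker-local
# partitions concatenated into a single in-memory copy before construction,
# which is why the streaming ``QuantileDMatrix`` path is generally cheaper.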


def _dmatrix_from_list_of_parts(is_quantile: bool, **kwargs: Any) -> DMatrix:
    if is_quantile:
        return _create_quantile_dmatrix(**kwargs)
    return _create_dmatrix(**kwargs)


def _get_dmatrices(
    train_ref: dict,
    train_id: int,
    *refs: dict,
    evals_id: Sequence[int],
    evals_name: Sequence[str],
    n_threads: int,
    model: Optional[Booster],
) -> Tuple[DMatrix, List[Tuple[DMatrix, str]]]:
    # Create the training DMatrix.
    Xy = _dmatrix_from_list_of_parts(
        **train_ref, nthread=n_threads, model=model, Xy_cats=None
    )

    # Create the evaluation DMatrices.
    evals: List[Tuple[DMatrix, str]] = []
    Xy_cats = Xy.get_categories()

    for i, ref in enumerate(refs):
        # Same DMatrix as the training one.
        if evals_id[i] == train_id:
            evals.append((Xy, evals_name[i]))
            continue
        # Check whether the training DMatrix has been used as a reference.
        if ref.get("ref", None) is not None:
            if ref["ref"] != train_id:
                raise ValueError(_RefError)
            del ref["ref"]  # Avoid a duplicated parameter in the next call.
            eval_xy = _dmatrix_from_list_of_parts(
                **ref, nthread=n_threads, ref=Xy, Xy_cats=Xy_cats, model=model
            )
        else:
            eval_xy = _dmatrix_from_list_of_parts(
                **ref, nthread=n_threads, Xy_cats=Xy_cats, model=model
            )
        evals.append((eval_xy, evals_name[i]))
    return Xy, evals
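
# For illustration: with ``evals_id = [train_id, some_other_id]`` the first
# evaluation entry reuses the training DMatrix directly, while the second is
# built from its own parts, using the training matrix as the quantile
# reference when its ``ref`` field points at ``train_id``.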