MLPproject/.venv/lib/python3.12/site-packages/xgboost/_data_utils.py

"""Helpers for interfacing array like objects."""

import copy
import ctypes
import json
from abc import ABC, abstractmethod
from functools import cache as fcache
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    List,
    Literal,
    Optional,
    Protocol,
    Tuple,
    Type,
    TypeAlias,
    TypedDict,
    TypeGuard,
    Union,
    cast,
    overload,
)

import numpy as np

from ._typing import (
    ArrowCatList,
    CNumericPtr,
    DataType,
    FeatureTypes,
    NumpyDType,
    NumpyOrCupy,
)
from .compat import import_cupy, import_pyarrow, lazy_isinstance

if TYPE_CHECKING:
    import pandas as pd
    import pyarrow as pa


# Used for accepting inputs for numpy and cupy arrays
class _ArrayLikeArg(Protocol):
    """Structural type for any object that exposes ``__array_interface__``.

    Only used for static typing; the property is expected to return a dictionary
    following the :py:data:`ArrayInf` schema.

    """

    @property
    def __array_interface__(self) -> "ArrayInf": ...


class _CudaArrayLikeArg(Protocol):
    """Structural type for any object that exposes ``__cuda_array_interface__``.

    Only used for static typing; the property is expected to return a dictionary
    following the :py:data:`CudaArrayInf` schema.

    """

    @property
    def __cuda_array_interface__(self) -> "CudaArrayInf": ...


# Schema of the dictionary returned by `__array_interface__` (host memory).
# - data: (address, read-only flag) pair.
# - typestr: type string such as "<i4" or "|i1" (see the helpers in this module).
# - strides: None is used throughout this module instead of explicit strides.
# - mask: optional validity mask, either an interface dictionary, an object exposing
#   the interface, or None.
ArrayInf = TypedDict(
    "ArrayInf",
    {
        "data": Tuple[int, bool],
        "typestr": str,
        "version": Literal[3],
        "strides": Optional[Tuple[int, ...]],
        "shape": Tuple[int, ...],
        "mask": Union["ArrayInf", None, _ArrayLikeArg],
    },
)

# Schema of the dictionary returned by `__cuda_array_interface__`. Same fields as
# `ArrayInf` with an additional `stream` handle.
# NOTE(review): `mask` is typed with the host-side `ArrayInf`/`_ArrayLikeArg` here;
# presumably the CUDA counterparts are what is stored at runtime -- confirm.
CudaArrayInf = TypedDict(
    "CudaArrayInf",
    {
        "data": Tuple[int, bool],
        "typestr": str,
        "version": Literal[3],
        "strides": Optional[Tuple[int, ...]],
        "shape": Tuple[int, ...],
        "mask": Union["ArrayInf", None, _ArrayLikeArg],
        "stream": int,
    },
)

# A string array represented as two buffers: `offsets` delimiting each string and
# `values` storing the concatenated characters as bytes.
StringArray = TypedDict("StringArray", {"offsets": ArrayInf, "values": ArrayInf})
# Device counterpart of `StringArray`.
CudaStringArray = TypedDict(
    "CudaStringArray", {"offsets": CudaArrayInf, "values": CudaArrayInf}
)


def array_hasobject(data: DataType) -> bool:
    """Tell whether `data` carries a dtype that contains Python objects.

    Inputs without a ``dtype`` attribute, or whose dtype has no ``hasobject``
    attribute, are reported as not having objects.

    """
    dtype = getattr(data, "dtype", None)
    return getattr(dtype, "hasobject", False)


def cuda_array_interface_dict(data: _CudaArrayLikeArg) -> CudaArrayInf:
    """Returns a dictionary storing the CUDA array interface.

    When the interface carries a validity mask object, the mask is replaced by its
    own ``__cuda_array_interface__`` dictionary so that the result is JSON
    serializable. A missing or ``None`` mask is passed through unchanged.

    Raises
    ------
    ValueError
        If the input has `object` dtype.

    """
    if array_hasobject(data):
        raise ValueError("Input data contains `object` dtype.  Expecting numeric data.")
    # Work on a shallow copy: the dictionary belongs to the producer, which may cache
    # it, and replacing the mask in place would corrupt it for later calls.
    ainf = dict(data.__cuda_array_interface__)
    mask = ainf.get("mask", None)
    # The protocol allows `mask` to be None, there is nothing to unwrap in that case.
    if mask is not None:
        ainf["mask"] = mask.__cuda_array_interface__
    return cast(CudaArrayInf, ainf)


def cuda_array_interface(data: _CudaArrayLikeArg) -> bytes:
    """Serialize the CUDA array interface of `data` as UTF-8 encoded JSON."""
    return json.dumps(cuda_array_interface_dict(data)).encode("utf-8")


def from_array_interface(interface: ArrayInf, zero_copy: bool = False) -> NumpyOrCupy:
    """Build a numpy or cupy array from an (CUDA) array interface dictionary.

    Parameters
    ----------
    interface :
        The array interface. When it contains the `stream` key it's treated as a
        ``__cuda_array_interface__`` and a cupy array is returned, otherwise a numpy
        array is returned.
    zero_copy :
        Passed to the array constructor as ``copy=not zero_copy``.

    """
    # Normalise a shallow copy so that the caller's dictionary is left untouched.
    # numpy requires tuples for some of the fields.
    normalised = copy.copy(interface)
    normalised["shape"] = tuple(normalised["shape"])
    raw_data = normalised["data"]
    normalised["data"] = (raw_data[0], raw_data[1])
    raw_strides = normalised.get("strides", None)
    if raw_strides is not None:
        normalised["strides"] = tuple(raw_strides)

    class _Holder:
        """Wrapper type for communicating with numpy and cupy."""

        @property
        def __array_interface__(self) -> Optional[ArrayInf]:
            return normalised

        @property
        def __cuda_array_interface__(self) -> Optional[ArrayInf]:
            return normalised

    holder = _Holder()
    need_copy = not zero_copy

    # CUDA stream is presented, this is a __cuda_array_interface__.
    if "stream" in interface:
        return import_cupy().array(holder, copy=need_copy)
    return np.array(holder, copy=need_copy)


# Default constant value for CUDA per-thread stream. This is the value written to the
# `stream` field of the CUDA array interfaces created in this module, and the stream
# used by `wait_event`. In the CUDA array interface protocol, 2 denotes the per-thread
# default stream.
STREAM_PER_THREAD = 2


# Typing is not strict as there are subtle differences between CUDA array interface and
# array interface. We handle them uniformly for now.
def make_array_interface(
    ptr: Union[CNumericPtr, int],
    shape: Tuple[int, ...],
    dtype: Type[np.number],
    is_cuda: bool,
) -> ArrayInf:
    """Create an __(cuda)_array_interface__ dictionary for a raw pointer.

    Parameters
    ----------
    ptr :
        Either an integer address or a ctypes pointer to the first element.
    shape :
        Shape of the array the pointer refers to.
    dtype :
        Element type, used for deriving the type string.
    is_cuda :
        Whether the pointer refers to device memory. If true, the result follows the
        CUDA array interface and contains the `stream` field.

    Returns
    -------
    The interface dictionary. For a null pointer (empty dataset), the interface of a
    zero-length array is returned as is.

    """
    # A zero-length array provides the template, including typestr and descr.
    if is_cuda:
        # pylint: disable=no-member
        template = import_cupy().empty(shape=(0,), dtype=dtype).__cuda_array_interface__
    else:
        # pylint: disable=no-member
        template = np.empty(shape=(0,), dtype=dtype).__array_interface__

    addr = ptr if isinstance(ptr, int) else ctypes.cast(ptr, ctypes.c_void_p).value
    n_elements = int(np.prod(shape))
    # A null pointer is only valid for an empty dataset.
    assert addr is not None or n_elements == 0
    if addr is None:
        return template

    template["data"] = (addr, True)
    if is_cuda and "stream" not in template:
        template["stream"] = STREAM_PER_THREAD
    template["shape"] = shape
    template["strides"] = None
    return template


def is_arrow_dict(data: Any) -> TypeGuard["pa.DictionaryArray"]:
    """Check whether `data` is an arrow dictionary array."""
    # The type is identified by its module and class name.
    module_name, class_name = "pyarrow.lib", "DictionaryArray"
    return lazy_isinstance(data, module_name, class_name)


class DfCatAccessor(Protocol):
    """Protocol for pandas cat accessor.

    The same protocol is used for typing the cuDF categories in this module, which is
    why it includes the CUDA array interface and the `_column` member.

    """

    # The category index (names of the categories).
    @property
    def categories(  # pylint: disable=missing-function-docstring
        self,
    ) -> "pd.Index": ...

    # The integer codes referring to the categories.
    @property
    def codes(self) -> "pd.Series": ...  # pylint: disable=missing-function-docstring

    # Type of the category index, used for telling numeric indices from string ones.
    @property
    def dtype(self) -> np.dtype: ...  # pylint: disable=missing-function-docstring

    # The category values as an array.
    @property
    def values(self) -> np.ndarray: ...  # pylint: disable=missing-function-docstring

    def to_arrow(  # pylint: disable=missing-function-docstring
        self,
    ) -> Union["pa.StringArray", "pa.IntegerArray"]: ...

    # Used by the cuDF code path for numeric category indices.
    @property
    def __cuda_array_interface__(self) -> CudaArrayInf: ...

    # Used by the cuDF code path for exporting string categories.
    @property
    def _column(self) -> Any: ...


def _is_df_cat(data: Any) -> TypeGuard[DfCatAccessor]:
    # Test pd.Series.cat, not pd.Series
    return hasattr(data, "categories") and hasattr(data, "codes")


@fcache
def _arrow_npdtype() -> Dict[Any, Type[np.number]]:
    """Return the mapping from arrow numeric types to numpy scalar types.

    The result is cached, pyarrow is imported on the first call.

    """
    import pyarrow as pa

    # The arrow factory functions and the numpy scalar types share the same names.
    type_names = (
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float16",
        "float32",
        "float64",
    )
    mapping: Dict[Any, Type[np.number]] = {
        getattr(pa, name)(): getattr(np, name) for name in type_names
    }
    return mapping


@overload
def _arrow_buf_inf(address: int, typestr: str, size: int, stream: None) -> ArrayInf: ...


@overload
def _arrow_buf_inf(
    address: int, typestr: str, size: int, stream: int
) -> CudaArrayInf: ...


def _arrow_buf_inf(
    address: int, typestr: str, size: int, stream: Optional[int]
) -> Union[ArrayInf, CudaArrayInf]:
    """Describe a 1-dim buffer using the (CUDA) array interface.

    Parameters
    ----------
    address :
        Address of the buffer.
    typestr :
        Type string of the elements.
    size :
        Number of elements.
    stream :
        The CUDA stream. The `stream` field is added to the result if and only if
        this is not None.

    """
    fields: Dict[str, Any] = {
        "data": (address, True),
        "typestr": typestr,
        "version": 3,
        "strides": None,
        "shape": (size,),
        "mask": None,
    }
    if stream is None:
        return cast("ArrayInf", fields)

    fields["stream"] = stream
    return cast("CudaArrayInf", fields)


def _arrow_cat_names_inf(cats: "pa.StringArray") -> Tuple[StringArray, Any]:
    """Get the array interface for the names of string categories.

    Returns
    -------
    The interface of the string array, along with the arrow buffers it refers to. The
    buffers are returned so that the caller can keep them alive.

    """
    if not TYPE_CHECKING:
        pa = import_pyarrow()

    # FIXME(jiamingy): Account for offset, need to find an implementation that returns
    # offset > 0
    assert cats.offset == 0
    validity, offsets_buf, values_buf = cats.buffers()
    assert offsets_buf.is_cpu

    n_offsets = len(cats) + 1
    # Expected size in bytes of the offset buffer for 64-bit and 32-bit offsets.
    i64_nbytes = n_offsets * (np.iinfo(np.int64).bits // 8)
    i32_nbytes = n_offsets * (np.iinfo(np.int32).bits // 8)

    if offsets_buf.size == i64_nbytes:
        if not isinstance(cats, pa.LargeStringArray):
            raise TypeError(f"Expecting a `pyarrow.Array`. Got: {type(cats)}.")
        # Convert to 32bit integer, arrow recommends against the use of i64. Also,
        # XGBoost cannot handle large number of categories (> 2**31).
        validity, offsets_buf, values_buf = cats.cast(pa.string()).buffers()

    if offsets_buf.size != i32_nbytes:
        raise TypeError(
            "Arrow dictionary type offsets is required to be 32-bit integer."
        )

    jnames: StringArray = {
        "offsets": _arrow_buf_inf(offsets_buf.address, "<i4", n_offsets, None),
        "values": _arrow_buf_inf(values_buf.address, "|i1", values_buf.size, None),
    }
    # Categories should not have missing values.
    assert validity is None

    return jnames, (validity, offsets_buf, values_buf)


def _arrow_array_inf(
    array: "pa.Array",
) -> ArrayInf:
    """Helper for handling categorical codes.

    Builds the array interface of a numeric arrow array. The validity bitmap, if
    there's one, is attached as the `mask` field.

    Raises
    ------
    TypeError
        If the input is not a `pyarrow.Array`.

    """
    if not TYPE_CHECKING:
        pa = import_pyarrow()
    if not isinstance(array, pa.Array):  # pylint: disable=E0606
        raise TypeError(f"Invalid input type: {type(array)}")

    validity, values = array.buffers()
    jdata = make_array_interface(
        values.address,
        shape=(len(array),),
        dtype=_arrow_npdtype()[array.type],
        is_cuda=not values.is_cpu,
    )

    jmask: Optional[ArrayInf] = None
    if validity is not None:
        # The validity bitmap is described with the "<t1" type string.
        jmask = {
            "data": (validity.address, True),
            "typestr": "<t1",
            "version": 3,
            "strides": None,
            "shape": (len(array),),
            "mask": None,
        }
        if not validity.is_cpu:
            jmask["stream"] = STREAM_PER_THREAD  # type: ignore

    jdata["mask"] = jmask
    return jdata


def arrow_cat_inf(  # pylint: disable=too-many-locals
    cats: "pa.StringArray",
    codes: Union[_ArrayLikeArg, _CudaArrayLikeArg, "pa.IntegerArray"],
) -> Tuple[StringArray, ArrayInf, Tuple]:
    """Build the array interfaces for a string-based arrow category array.

    Returns
    -------
    The interface of the category names, the interface of the codes, and the
    temporary objects that the interfaces refer to.

    """
    names_inf, names_buffers = _arrow_cat_names_inf(cats)
    codes_inf = _arrow_array_inf(codes)
    keep_alive = (names_buffers, None)
    return names_inf, codes_inf, keep_alive


def _ensure_np_dtype(
    data: DataType, dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
    """Make sure the np array has a supported type and is aligned.

    Arrays with object, float16, or bool dtype are converted to float32, in which
    case the returned dtype is float32 as well. Otherwise the input `dtype` is
    returned unchanged.

    """
    needs_f32 = array_hasobject(data) or data.dtype in [np.float16, np.bool_]
    if needs_f32:
        dtype = np.float32
        data = data.astype(dtype, copy=False)

    if data.flags.aligned:
        return data, dtype
    return np.require(data, requirements="A"), dtype


def array_interface_dict(data: np.ndarray) -> ArrayInf:
    """Returns an array interface from the input.

    When the interface carries a validity mask object, the mask is replaced by its
    own ``__array_interface__`` dictionary so that the result is JSON serializable. A
    missing or ``None`` mask is passed through unchanged.

    Raises
    ------
    ValueError
        If the input has `object` dtype.

    """
    if array_hasobject(data):
        raise ValueError("Input data contains `object` dtype.  Expecting numeric data.")
    # Work on a shallow copy: the dictionary belongs to the producer, which may cache
    # it, and replacing the mask in place would corrupt it for later calls.
    ainf = dict(data.__array_interface__)
    mask = ainf.get("mask", None)
    # The protocol allows `mask` to be None, there is nothing to unwrap in that case.
    if mask is not None:
        ainf["mask"] = mask.__array_interface__
    return cast(ArrayInf, ainf)


def pd_cat_inf(  # pylint: disable=too-many-locals
    cats: DfCatAccessor, codes: "pd.Series"
) -> Tuple[Union[StringArray, ArrayInf], ArrayInf, Tuple]:
    """Get the array interface representation of pandas category accessor.

    Parameters
    ----------
    cats :
        The category index, either numeric or string.
    codes :
        The category codes. Missing values (-1 in pandas) are replaced by NaN.

    Returns
    -------
    The interface of the category index, the interface of the codes, and a tuple of
    temporary objects backing the interfaces. The caller must keep the temporaries
    alive for as long as the interfaces are in use.

    Raises
    ------
    ValueError
        If the total size of the category names doesn't fit into 32-bit offsets.

    """
    # pandas uses -1 to represent missing values for categorical features
    codes = codes.replace(-1, np.nan)

    if np.issubdtype(cats.dtype, np.floating) or np.issubdtype(cats.dtype, np.integer):
        # Numeric index type
        name_values_num = cats.values
        jarr_values = array_interface_dict(name_values_num)
        code_values = codes.values
        jarr_codes = array_interface_dict(code_values)
        return jarr_values, jarr_codes, (name_values_num, code_values)

    def npstr_to_arrow_strarr(strarr: np.ndarray) -> Tuple[np.ndarray, str, bytes]:
        """Convert a numpy string array to an arrow string array.

        Returns the 32-bit offsets, the concatenated string, and its UTF-8 encoding.

        """
        names = list(strarr)
        encoded = [name.encode("utf-8") for name in names]
        # The offsets index into the UTF-8 buffer, hence they must be measured in
        # bytes instead of characters. The two differ for non-ASCII names.
        lengths = np.fromiter(
            (len(name) for name in encoded), dtype=np.int64, count=len(encoded)
        )
        offsets = np.concatenate([np.zeros(1, dtype=np.int64), np.cumsum(lengths)])
        if offsets[-1] > np.iinfo(np.int32).max:
            raise ValueError(
                "The total size of category names exceeds the limit of 32-bit offsets."
            )
        values = "".join(names)
        assert "\0" not in values  # arrow string array doesn't need null terminal
        return offsets.astype(np.int32), values, b"".join(encoded)

    # String index type
    name_offsets, name_values, bvalues = npstr_to_arrow_strarr(cats.values)
    name_offsets, _ = _ensure_np_dtype(name_offsets, np.int32)
    joffsets = array_interface_dict(name_offsets)

    # Address of the bytes object's internal buffer. `bvalues` is returned as part of
    # the temporaries to keep the buffer alive.
    ptr = ctypes.c_void_p.from_buffer(ctypes.c_char_p(bvalues)).value
    assert ptr is not None

    jvalues: ArrayInf = {
        "data": (ptr, True),
        "typestr": "|i1",
        # Number of bytes, consistent with the offsets.
        "shape": (len(bvalues),),
        "strides": None,
        "version": 3,
        "mask": None,
    }
    jnames: StringArray = {"offsets": joffsets, "values": jvalues}

    code_values = codes.values
    jcodes = array_interface_dict(code_values)

    buf = (
        name_offsets,
        name_values,
        bvalues,
        code_values,
    )  # store temporary values
    return jnames, jcodes, buf


def array_interface(data: np.ndarray) -> bytes:
    """Serialize the array interface of `data` as UTF-8 encoded JSON."""
    return json.dumps(array_interface_dict(data)).encode("utf-8")


def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None:
    """Make sure there's no missing value in the meta data.

    Raises
    ------
    ValueError
        If the CUDA array interface of `data` has a mask that is not None.

    """
    interface = data.__cuda_array_interface__
    if interface.get("mask") is None:
        # Either no mask at all, or an explicit None.
        return
    raise ValueError(f"Missing value is not allowed for: {field}")


class ArrowSchema(ctypes.Structure):
    """The Schema type from arrow C array.

    ctypes mirror of the schema struct in the arrow C data interface. Only the
    `format` field is read in this module, the remaining fields are declared for
    obtaining the correct memory layout.

    """

    _fields_ = [
        # Type description string, e.g. b"u" for utf8 strings.
        ("format", ctypes.c_char_p),
        ("name", ctypes.c_char_p),
        ("metadata", ctypes.c_char_p),
        ("flags", ctypes.c_int64),
        ("n_children", ctypes.c_int64),
        # The pointers below are declared as opaque since they are not dereferenced.
        ("children", ctypes.POINTER(ctypes.c_void_p)),
        ("dictionary", ctypes.c_void_p),
        ("release", ctypes.c_void_p),
        ("private_data", ctypes.c_void_p),
    ]


class ArrowArray(ctypes.Structure):
    """The Array type from arrow C array."""


# The fields are assigned after the class definition because the struct refers to
# itself through the `children` and the `dictionary` pointers.
ArrowArray._fields_ = [  # pylint: disable=protected-access
    ("length", ctypes.c_int64),
    ("null_count", ctypes.c_int64),
    ("offset", ctypes.c_int64),
    ("n_buffers", ctypes.c_int64),
    ("n_children", ctypes.c_int64),
    # Array of `n_buffers` buffer addresses. A null entry is read as None in Python.
    ("buffers", ctypes.POINTER(ctypes.c_void_p)),
    ("children", ctypes.POINTER(ctypes.POINTER(ArrowArray))),
    ("dictionary", ctypes.POINTER(ArrowArray)),
    ("release", ctypes.c_void_p),
    ("private_data", ctypes.c_void_p),
]


class ArrowDeviceArray(ctypes.Structure):
    """The Array type from arrow C device array.

    Wraps an :py:class:`ArrowArray` along with the information about the device that
    owns the buffers.

    """

    _fields_ = [
        # The array struct is embedded by value, not by pointer.
        ("array", ArrowArray),
        ("device_id", ctypes.c_int64),
        # Device type enum, 2 is CUDA (see `cudf_cat_inf`).
        ("device_type", ctypes.c_int32),
        # Address of an event for synchronization, consumed by `wait_event`.
        ("sync_event", ctypes.c_void_p),
        ("reserved", ctypes.c_int64 * 3),
    ]


# ctypes bindings of the CPython capsule API, used for extracting the raw pointers from
# the capsules returned by `__arrow_c_device_array__`.

# const char *PyCapsule_GetName(PyObject *capsule)
PyCapsule_GetName = ctypes.pythonapi.PyCapsule_GetName
PyCapsule_GetName.restype = ctypes.c_char_p
PyCapsule_GetName.argtypes = [ctypes.py_object]


# void *PyCapsule_GetPointer(PyObject *capsule, const char *name)
PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer
PyCapsule_GetPointer.restype = ctypes.c_void_p
PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]


def wait_event(event_hdl: int) -> None:
    """Wait for CUDA event exported by arrow.

    Makes the per-thread default stream wait for the event.

    Parameters
    ----------
    event_hdl :
        Address of the exported event (the `sync_event` field of the arrow device
        array). A null address (None or 0) means there's no event to wait for and
        the function returns immediately.

    Raises
    ------
    ValueError
        If the CUDA runtime reports an error.

    """
    # `sync_event` is optional in the arrow C device interface. ctypes reads a null
    # `c_void_p` field as None, dereferencing it would raise a NULL pointer access
    # error.
    if not event_hdl:
        return

    # cuda-python is a dependency of cuDF.
    from cuda.bindings import runtime as cudart

    # The handle points to the event, read the event itself from the address.
    event = ctypes.cast(event_hdl, ctypes.POINTER(ctypes.c_int64))
    (status,) = cudart.cudaStreamWaitEvent(
        STREAM_PER_THREAD,
        event.contents.value,
        cudart.cudaEventWaitDefault,
    )
    if status != cudart.cudaError_t.cudaSuccess:
        _, msg = cudart.cudaGetErrorString(status)
        raise ValueError(msg)


def cudf_cat_inf(  # pylint: disable=too-many-locals
    cats: DfCatAccessor, codes: "pd.Series"
) -> Tuple[Union[CudaArrayInf, CudaStringArray], ArrayInf, Tuple]:
    """Obtain the cuda array interface for cuDF categories.

    Parameters
    ----------
    cats :
        The category index, either numeric or string.
    codes :
        The category codes.

    Returns
    -------
    The interface of the category index, the interface of the codes, and a tuple of
    objects backing the interfaces. The caller must keep the latter alive for as long
    as the interfaces are in use.

    Raises
    ------
    TypeError
        If the category index is a large string array, or is neither numeric nor
        string.

    """
    cp = import_cupy()
    is_num_idx = cp.issubdtype(cats.dtype, cp.floating) or cp.issubdtype(
        cats.dtype, cp.integer
    )
    if is_num_idx:
        cats_ainf = cuda_array_interface_dict(cats)
        codes_ainf = cuda_array_interface_dict(codes)
        return cats_ainf, codes_ainf, (cats, codes)

    # pylint: disable=protected-access
    arrow_col = cats._column.to_pylibcudf(mode="read")
    # Tuple[types.CapsuleType, types.CapsuleType]
    schema, array = arrow_col.__arrow_c_device_array__()

    array_ptr = PyCapsule_GetPointer(array, PyCapsule_GetName(array))
    schema_ptr = PyCapsule_GetPointer(schema, PyCapsule_GetName(schema))

    # Cast to arrow array
    arrow_device_array = ctypes.cast(
        array_ptr, ctypes.POINTER(ArrowDeviceArray)
    ).contents
    # Validate the device before interpreting the event as a CUDA event.
    assert arrow_device_array.device_type == 2  # 2 is CUDA
    wait_event(arrow_device_array.sync_event)

    arrow_array = arrow_device_array.array
    # Validate the layout before indexing into the buffers. ctypes pointers are not
    # bounds-checked.
    assert arrow_array.n_children == 0
    assert arrow_array.n_buffers == 3
    assert arrow_array.offset == 0
    mask, offset, data = (
        arrow_array.buffers[0],
        arrow_array.buffers[1],
        arrow_array.buffers[2],
    )
    # Categories should not have missing values.
    assert mask is None

    # Cast to ArrowSchema
    arrow_schema = ctypes.cast(schema_ptr, ctypes.POINTER(ArrowSchema)).contents
    assert arrow_schema.format in (b"u", b"U", b"vu")  # utf8, large utf8
    if arrow_schema.format in (b"u", b"vu"):
        joffset: CudaArrayInf = _arrow_buf_inf(
            offset, "<i4", arrow_array.length + 1, STREAM_PER_THREAD
        )
    elif arrow_schema.format == b"U":
        raise TypeError("Large string for category index (names) is not supported.")
    else:
        raise TypeError(
            "Unexpected type for category index. It's neither numeric nor string."
        )
    # 0 size for unknown
    jdata: CudaArrayInf = _arrow_buf_inf(data, "|i1", 0, STREAM_PER_THREAD)
    jnames: CudaStringArray = {
        "offsets": joffset,
        "values": jdata,
    }

    jcodes = cuda_array_interface_dict(codes)
    # The exported column owns the buffers referred by `jnames`.
    return jnames, jcodes, (arrow_col,)


class Categories:
    """An internal storage class for categories returned by the DMatrix and the
    Booster. This class is designed to be opaque. It is intended to be used exclusively
    by XGBoost as an intermediate storage for re-coding categorical data.

    The categories are saved along with the booster object. As a result, users don't
    need to preserve this class for re-coding. Use the booster model IO instead if you
    want to preserve the categories in a stable format.

    .. versionadded:: 3.1.0

    .. warning::

        This class is internal.

    .. code-block:: python

        Xy = xgboost.QuantileDMatrix(X, y, enable_categorical=True)
        booster = xgboost.train({}, Xy)

        categories = booster.get_categories() # Get categories

        # Use categories as a reference for re-coding
        Xy_new = xgboost.QuantileDMatrix(
            X_new, y_new, feature_types=categories, enable_categorical=True, ref=Xy
        )

        # Categories will be part of the `model.json`.
        booster.save_model("model.json")

    Parameters
    ----------
    handle :
        A pair of the native handle and a callable that frees it.
    arrow_arrays :
        Optional categories exported as arrow arrays, returned by :py:meth:`to_arrow`.

    """

    def __init__(
        self,
        handle: Tuple[ctypes.c_void_p, Callable[[], None]],
        arrow_arrays: Optional[ArrowCatList],
    ) -> None:
        # The handle type is a bundle of the handle and the free call. Otherwise, we
        # will have to import the lib and checkcall inside the __del__ method from the
        # core module to avoid cyclic model dependency. Importing modules in __del__ can
        # result in Python abort if __del__ is called during exception handling
        # (interpreter is shutting down).
        self._handle, self._free = handle
        self._arrow_arrays = arrow_arrays

    def to_arrow(self) -> ArrowCatList:
        """Get the categories in the dataset. The results are stored in a list of
        (feature name, arrow array) pairs, with one array for each categorical
        feature. If a feature is numerical, then the corresponding column in the list is
        None. A value error will be raised if this container was created without the
        `export_to_arrow` option.

        """
        if self._arrow_arrays is None:
            raise ValueError(
                "The `export_to_arrow` option of the `get_categories` method"
                " is required."
            )
        return self._arrow_arrays

    def empty(self) -> bool:
        """Returns True if there's no category."""
        return self._handle.value is None

    def get_handle(self) -> int:
        """Internal method for retrieving the handle. The container must not be
        empty.

        """
        assert self._handle.value
        return self._handle.value

    def __del__(self) -> None:
        # The finalizer also runs for instances whose `__init__` raised before the
        # handle was assigned, there's nothing to free in that case.
        handle = getattr(self, "_handle", None)
        if handle is None or handle.value is None:
            return
        self._free()


def get_ref_categories(
    feature_types: Optional[Union[FeatureTypes, Categories]],
) -> Tuple[Optional[FeatureTypes], Optional[Categories]]:
    """Split the `feature_types` argument into feature types and reference categories.

    Various `DMatrix` reuse the `feature_types` parameter for specifying the reference
    categories. Exactly one element of the returned pair can be non-None: the
    categories when a :py:class:`Categories` object is passed in, the original
    `feature_types` otherwise.

    """
    if not isinstance(feature_types, Categories):
        return feature_types, None
    return None, feature_types


# Type schema for storing JSON-encoded array interface. There's one entry per column:
# a numeric column is a single interface, whereas a categorical column is a pair of
# the category index and the codes.
AifType: TypeAlias = List[
    Union[
        # numeric column
        Union[ArrayInf, CudaArrayInf],
        # categorical column
        Tuple[
            # (cuda) numeric index | (cuda) string index
            Union[ArrayInf, CudaArrayInf, StringArray, CudaStringArray],
            Union[ArrayInf, CudaArrayInf],  # codes
        ],
    ]
]


class TransformedDf(ABC):
    """Internal class for storing transformed dataframe.

    Parameters
    ----------
    ref_categories :
        Optional reference categories used for re-coding.

    aitfs :
        Array interface for each column.

    """

    temporary_buffers: List[Tuple] = []

    def __init__(self, ref_categories: Optional[Categories], aitfs: AifType) -> None:
        self.ref_categories = ref_categories
        if ref_categories is not None and ref_categories.get_handle() is not None:
            aif = ref_categories.get_handle()
            self.ref_aif: Optional[int] = aif
        else:
            self.ref_aif = None

        self.aitfs = aitfs

    def array_interface(self) -> bytes:
        """Return a byte string for JSON encoded array interface."""
        if self.ref_categories is not None:
            ref_inf: dict = {"ref_categories": self.ref_aif, "columns": self.aitfs}
            inf = bytes(json.dumps(ref_inf), "utf-8")
        else:
            inf = bytes(json.dumps(self.aitfs), "utf-8")
        return inf

    @property
    @abstractmethod
    def shape(self) -> Tuple[int, int]:
        """Return the shape of the dataframe."""