Files
MLPproject/.venv/lib/python3.12/site-packages/xgboost/_data_utils.py
2025-10-23 15:44:32 +02:00

752 lines
23 KiB
Python

"""Helpers for interfacing array like objects."""
import copy
import ctypes
import json
from abc import ABC, abstractmethod
from functools import cache as fcache
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
List,
Literal,
Optional,
Protocol,
Tuple,
Type,
TypeAlias,
TypedDict,
TypeGuard,
Union,
cast,
overload,
)
import numpy as np
from ._typing import (
ArrowCatList,
CNumericPtr,
DataType,
FeatureTypes,
NumpyDType,
NumpyOrCupy,
)
from .compat import import_cupy, import_pyarrow, lazy_isinstance
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# Structural types used to accept numpy-like and cupy-like array inputs.
class _ArrayLikeArg(Protocol):
    """Any object that exposes the ``__array_interface__`` attribute."""

    @property
    def __array_interface__(self) -> "ArrayInf":
        """Return the array interface dictionary of the object."""


class _CudaArrayLikeArg(Protocol):
    """Any object that exposes the ``__cuda_array_interface__`` attribute."""

    @property
    def __cuda_array_interface__(self) -> "CudaArrayInf":
        """Return the CUDA array interface dictionary of the object."""
class ArrayInf(TypedDict):
    """Typed view of an ``__array_interface__`` dictionary."""

    data: Tuple[int, bool]
    typestr: str
    version: Literal[3]
    strides: Optional[Tuple[int, ...]]
    shape: Tuple[int, ...]
    mask: Union["ArrayInf", None, _ArrayLikeArg]


class CudaArrayInf(TypedDict):
    """Typed view of a ``__cuda_array_interface__`` dictionary.

    Same fields as :class:`ArrayInf` with an additional ``stream`` entry.
    """

    data: Tuple[int, bool]
    typestr: str
    version: Literal[3]
    strides: Optional[Tuple[int, ...]]
    shape: Tuple[int, ...]
    mask: Union["ArrayInf", None, _ArrayLikeArg]
    stream: int


class StringArray(TypedDict):
    """A string array described by an offsets interface and a values interface."""

    offsets: ArrayInf
    values: ArrayInf


class CudaStringArray(TypedDict):
    """CUDA counterpart of :class:`StringArray`."""

    offsets: CudaArrayInf
    values: CudaArrayInf
def array_hasobject(data: DataType) -> bool:
    """Tell whether the dtype of ``data`` contains Python objects.

    Inputs without a ``dtype`` attribute, or whose dtype lacks the ``hasobject``
    attribute, are reported as not having the object dtype.
    """
    if not hasattr(data, "dtype"):
        return False
    dtype = data.dtype
    if not hasattr(dtype, "hasobject"):
        return False
    return dtype.hasobject
def cuda_array_interface_dict(data: _CudaArrayLikeArg) -> CudaArrayInf:
    """Returns a dictionary storing the CUDA array interface.

    Parameters
    ----------
    data :
        An object exposing ``__cuda_array_interface__``.

    Returns
    -------
    The interface dictionary. When the input has a mask object, the ``mask``
    entry is replaced by the mask's own CUDA array interface dictionary.

    Raises
    ------
    ValueError
        If the input has the ``object`` dtype.
    """
    if array_hasobject(data):
        raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
    ainf = data.__cuda_array_interface__
    # A missing mask and an explicit `None` mask both mean "no mask", there is
    # nothing to expand in either case.
    mask = ainf.get("mask", None)
    if mask is not None:
        # Replace the entry on a shallow copy. The producer may hand out the same
        # dictionary on every access and it must keep its mask object.
        ainf = copy.copy(ainf)
        ainf["mask"] = mask.__cuda_array_interface__  # type: ignore
    return ainf
def cuda_array_interface(data: _CudaArrayLikeArg) -> bytes:
    """Serialize the CUDA array interface of ``data`` as UTF-8 encoded JSON."""
    return json.dumps(cuda_array_interface_dict(data)).encode("utf-8")
def from_array_interface(interface: ArrayInf, zero_copy: bool = False) -> NumpyOrCupy:
    """Materialize a numpy or cupy array from an array interface dictionary.

    Parameters
    ----------
    interface :
        The array interface. When it carries a ``stream`` entry it is treated as a
        ``__cuda_array_interface__`` and handed to cupy, otherwise numpy is used.
    zero_copy :
        Negated and forwarded as the ``copy`` argument of the array constructor.

    Returns
    -------
    The array created by ``numpy.array`` or ``cupy.array``.
    """
    # Shallow copy, the dictionary passed in by the caller is left untouched.
    normalized = copy.copy(interface)
    # Convert some fields to tuple as required by numpy
    normalized["shape"] = tuple(normalized["shape"])
    normalized["data"] = (normalized["data"][0], normalized["data"][1])
    strides = normalized.get("strides", None)
    if strides is not None:
        normalized["strides"] = tuple(strides)

    class _InterfaceHolder:
        """Wrapper type for communicating with numpy and cupy."""

        @property
        def __array_interface__(self) -> ArrayInf:
            return normalized

        @property
        def __cuda_array_interface__(self) -> ArrayInf:
            return normalized

    holder = _InterfaceHolder()
    if "stream" in interface:
        # CUDA stream is presented, this is a __cuda_array_interface__.
        return import_cupy().array(holder, copy=not zero_copy)
    return np.array(holder, copy=not zero_copy)
# Default constant value for CUDA per-thread stream. Within this module it is used
# as the "stream" entry of generated CUDA array interfaces and as the stream
# argument of `cudaStreamWaitEvent`.
STREAM_PER_THREAD = 2
# Typing is not strict as there are subtle differences between CUDA array interface and
# array interface. We handle them uniformly for now.
def make_array_interface(
    ptr: Union[CNumericPtr, int],
    shape: Tuple[int, ...],
    dtype: Type[np.number],
    is_cuda: bool,
) -> ArrayInf:
    """Build an ``__(cuda)_array_interface__`` dictionary around a raw pointer.

    Parameters
    ----------
    ptr :
        Either an integer address or a ctypes pointer to the first element.
    shape :
        Shape stored in the resulting interface.
    dtype :
        Element type, used to derive the type description fields.
    is_cuda :
        Whether to produce a CUDA array interface (via cupy) instead of a numpy one.

    Returns
    -------
    The interface dictionary. When the pointer is null (which is only accepted for
    a zero-length shape) the interface of an empty array is returned unchanged.
    """
    # A zero-length array of the requested dtype supplies typestr and descr.
    if is_cuda:
        template = import_cupy().empty(shape=(0,), dtype=dtype)
        inf = template.__cuda_array_interface__  # pylint: disable=no-member
    else:
        template = np.empty(shape=(0,), dtype=dtype)
        inf = template.__array_interface__  # pylint: disable=no-member

    addr = ptr if isinstance(ptr, int) else ctypes.cast(ptr, ctypes.c_void_p).value
    n_elements = int(np.prod(shape))
    # Handle empty dataset.
    assert addr is not None or n_elements == 0
    if addr is None:
        return inf

    inf["data"] = (addr, True)
    if is_cuda and "stream" not in inf:
        inf["stream"] = STREAM_PER_THREAD
    inf.update(shape=shape, strides=None)
    return inf
def is_arrow_dict(data: Any) -> TypeGuard["pa.DictionaryArray"]:
    """Check whether ``data`` is an arrow dictionary array."""
    module, type_name = "pyarrow.lib", "DictionaryArray"
    return lazy_isinstance(data, module, type_name)
class DfCatAccessor(Protocol):
    """Protocol for pandas cat accessor."""

    @property
    def categories(self) -> "pd.Index":
        """The categories, as an index."""

    @property
    def codes(self) -> "pd.Series":
        """The category codes, as a series."""

    @property
    def dtype(self) -> np.dtype:
        """The dtype reported by the accessor."""

    @property
    def values(self) -> np.ndarray:
        """The values as a numpy array."""

    def to_arrow(self) -> Union["pa.StringArray", "pa.IntegerArray"]:
        """Convert to an arrow string or integer array."""

    @property
    def __cuda_array_interface__(self) -> CudaArrayInf:
        """The CUDA array interface dictionary."""

    @property
    def _column(self) -> Any:
        """The underlying column object."""
def _is_df_cat(data: Any) -> TypeGuard[DfCatAccessor]:
    """Check whether ``data`` has both the ``categories`` and ``codes`` attributes."""
    # Test pd.Series.cat, not pd.Series
    required = ("categories", "codes")
    return all(hasattr(data, attr) for attr in required)
@fcache
def _arrow_npdtype() -> Dict[Any, Type[np.number]]:
    """Return the cached mapping from arrow numeric types to numpy scalar types."""
    import pyarrow as pa

    # The arrow factory function and the numpy scalar type share the same name.
    names = (
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float16",
        "float32",
        "float64",
    )
    return {getattr(pa, name)(): getattr(np, name) for name in names}
@overload
def _arrow_buf_inf(address: int, typestr: str, size: int, stream: None) -> ArrayInf: ...


@overload
def _arrow_buf_inf(
    address: int, typestr: str, size: int, stream: int
) -> CudaArrayInf: ...


def _arrow_buf_inf(
    address: int, typestr: str, size: int, stream: Optional[int]
) -> Union[ArrayInf, CudaArrayInf]:
    """Describe a one dimensional buffer as an array interface dictionary.

    Parameters
    ----------
    address :
        Address of the first element.
    typestr :
        Type string stored in the interface.
    size :
        Number of elements, used as the only dimension of the shape.
    stream :
        When not None, a CUDA array interface with this ``stream`` is produced.
    """
    fields: Dict[str, Any] = {
        "data": (address, True),
        "typestr": typestr,
        "version": 3,
        "strides": None,
        "shape": (size,),
        "mask": None,
    }
    if stream is None:
        return cast(ArrayInf, fields)
    fields["stream"] = stream
    return cast(CudaArrayInf, fields)
def _arrow_cat_names_inf(cats: "pa.StringArray") -> Tuple[StringArray, Any]:
    """Describe an arrow string array of category names as array interfaces.

    Returns
    -------
    The offsets/values interfaces, along with the arrow buffers they point into.
    """
    if not TYPE_CHECKING:
        pa = import_pyarrow()

    # FIXME(jiamingy): Account for offset, need to find an implementation that returns
    # offset > 0
    assert cats.offset == 0
    validity, offsets_buf, chars_buf = cats.buffers()
    assert offsets_buf.is_cpu

    n_offsets = len(cats) + 1

    def offsets_nbytes(int_type: Type) -> int:
        """Size in bytes of the offsets buffer if it stored ``int_type`` values."""
        return n_offsets * (np.iinfo(int_type).bits // 8)

    if offsets_buf.size == offsets_nbytes(np.int64):
        if not isinstance(cats, pa.LargeStringArray):
            raise TypeError(f"Expecting a `pyarrow.Array`. Got: {type(cats)}.")
        # Convert to 32bit integer, arrow recommends against the use of i64. Also,
        # XGBoost cannot handle large number of categories (> 2**31).
        narrowed = cats.cast(pa.string())
        validity, offsets_buf, chars_buf = narrowed.buffers()

    if offsets_buf.size != offsets_nbytes(np.int32):
        raise TypeError(
            "Arrow dictionary type offsets is required to be 32-bit integer."
        )

    names: StringArray = {
        "offsets": _arrow_buf_inf(offsets_buf.address, "<i4", n_offsets, None),
        "values": _arrow_buf_inf(chars_buf.address, "|i1", chars_buf.size, None),
    }
    # Categories should not have missing values.
    assert validity is None
    return names, (validity, offsets_buf, chars_buf)
def _arrow_array_inf(
    array: "pa.Array",
) -> ArrayInf:
    """Describe an arrow numeric array (categorical codes) as an array interface.

    The validity buffer, when present, is attached as the ``mask`` entry.

    Raises
    ------
    TypeError
        If the input is not a ``pyarrow.Array``.
    """
    if not TYPE_CHECKING:
        pa = import_pyarrow()

    if not isinstance(array, pa.Array):  # pylint: disable=E0606
        raise TypeError(f"Invalid input type: {type(array)}")

    validity, values = array.buffers()
    inf = make_array_interface(
        values.address,
        shape=(len(array),),
        dtype=_arrow_npdtype()[array.type],
        is_cuda=not values.is_cpu,
    )
    if validity is None:
        inf["mask"] = None
        return inf

    mask_inf: Optional[ArrayInf] = {
        "data": (validity.address, True),
        "typestr": "<t1",
        "version": 3,
        "strides": None,
        "shape": (len(array),),
        "mask": None,
    }
    if not validity.is_cpu:
        mask_inf["stream"] = STREAM_PER_THREAD  # type: ignore
    inf["mask"] = mask_inf
    return inf
def arrow_cat_inf(  # pylint: disable=too-many-locals
    cats: "pa.StringArray",
    codes: Union[_ArrayLikeArg, _CudaArrayLikeArg, "pa.IntegerArray"],
) -> Tuple[StringArray, ArrayInf, Tuple]:
    """Build the array interfaces of a string-based category array.

    Returns
    -------
    The interface of the names, the interface of the codes, and a tuple holding
    the temporary buffers referenced by the names.
    """
    names_inf, names_buffers = _arrow_cat_names_inf(cats)
    codes_inf = _arrow_array_inf(codes)
    keep_alive = (names_buffers, None)
    return names_inf, codes_inf, keep_alive
def _ensure_np_dtype(
    data: DataType, dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
    """Convert unsupported dtypes to float32 and make sure the array is aligned.

    Object, float16 and bool inputs are cast to float32, in which case the returned
    dtype is float32 as well. Otherwise the dtype argument is returned as is.
    """
    needs_f32 = array_hasobject(data) or data.dtype in (np.float16, np.bool_)
    if needs_f32:
        dtype = np.float32
        data = data.astype(dtype, copy=False)
    if data.flags.aligned:
        return data, dtype
    return np.require(data, requirements="A"), dtype
def array_interface_dict(data: np.ndarray) -> ArrayInf:
    """Returns an array interface from the input.

    Parameters
    ----------
    data :
        An object exposing ``__array_interface__``.

    Returns
    -------
    The interface dictionary. When the input has a mask object, the ``mask``
    entry is replaced by the mask's own array interface dictionary.

    Raises
    ------
    ValueError
        If the input has the ``object`` dtype.
    """
    if array_hasobject(data):
        raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
    ainf = data.__array_interface__
    # A missing mask and an explicit `None` mask both mean "no mask", there is
    # nothing to expand in either case.
    mask = ainf.get("mask", None)
    if mask is not None:
        # Replace the entry on a shallow copy. The producer may hand out the same
        # dictionary on every access and it must keep its mask object.
        ainf = copy.copy(ainf)
        ainf["mask"] = mask.__array_interface__
    return cast(ArrayInf, ainf)
def pd_cat_inf(  # pylint: disable=too-many-locals
    cats: DfCatAccessor, codes: "pd.Series"
) -> Tuple[Union[StringArray, ArrayInf], ArrayInf, Tuple]:
    """Get the array interface representation of pandas category accessor.

    Parameters
    ----------
    cats :
        The categories. Numeric categories are exported directly, anything else
        is treated as strings and exported as UTF-8 offsets/values buffers.
    codes :
        The category codes, -1 is replaced by NaN.

    Returns
    -------
    The interface of the categories, the interface of the codes, and a tuple of
    the temporary objects whose memory the interfaces point into. The tuple must
    be kept alive for as long as the interfaces are in use.
    """
    # pandas uses -1 to represent missing values for categorical features
    codes = codes.replace(-1, np.nan)

    if np.issubdtype(cats.dtype, np.floating) or np.issubdtype(cats.dtype, np.integer):
        # Numeric index type
        name_values_num = cats.values
        jarr_values = array_interface_dict(name_values_num)
        code_values = codes.values
        jarr_codes = array_interface_dict(code_values)
        return jarr_values, jarr_codes, (name_values_num, code_values)

    def npstr_to_arrow_strarr(strarr: np.ndarray) -> Tuple[np.ndarray, str, bytes]:
        """Convert a numpy string array to an arrow string array.

        Returns the 32-bit offsets, the concatenated string and its UTF-8 bytes.
        """
        names = list(strarr)
        encoded = [name.encode("utf-8") for name in names]
        # The offsets index into the encoded buffer, hence they are measured in
        # bytes instead of characters. The two differ for non-ASCII names.
        offsets = np.zeros(len(encoded) + 1, dtype=np.int64)
        offsets[1:] = np.cumsum([len(chunk) for chunk in encoded], dtype=np.int64)
        # Joining works for an empty input as well, unlike summing the array.
        values = "".join(names)
        assert "\0" not in values  # arrow string array doesn't need null terminal
        return offsets.astype(np.int32), values, b"".join(encoded)

    # String index type
    name_offsets, name_values, bvalues = npstr_to_arrow_strarr(cats.values)
    name_offsets, _ = _ensure_np_dtype(name_offsets, np.int32)
    joffsets = array_interface_dict(name_offsets)
    # Address of the internal buffer of the bytes object.
    ptr = ctypes.c_void_p.from_buffer(ctypes.c_char_p(bvalues)).value
    assert ptr is not None

    jvalues: ArrayInf = {
        "data": (ptr, True),
        "typestr": "|i1",
        # Number of bytes in the buffer, not number of characters.
        "shape": (len(bvalues),),
        "strides": None,
        "version": 3,
        "mask": None,
    }
    jnames: StringArray = {"offsets": joffsets, "values": jvalues}

    code_values = codes.values
    jcodes = array_interface_dict(code_values)

    buf = (
        name_offsets,
        name_values,
        bvalues,
        code_values,
    )  # store temporary values
    return jnames, jcodes, buf
def array_interface(data: np.ndarray) -> bytes:
    """Serialize the array interface of ``data`` as UTF-8 encoded JSON."""
    return json.dumps(array_interface_dict(data)).encode("utf-8")
def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None:
    """Reject meta data that carries a mask.

    Raises
    ------
    ValueError
        If the CUDA array interface of ``data`` has a ``mask`` that is not None.
    """
    interface = data.__cuda_array_interface__
    if "mask" not in interface:
        return
    if interface["mask"] is not None:
        raise ValueError(f"Missing value is not allowed for: {field}")
class ArrowSchema(ctypes.Structure):
    """The Schema type from arrow C array."""

    # ctypes lays the members out in the order they are listed here, the order and
    # the types must not be changed. Within this module only `format` is read, by
    # `cudf_cat_inf`.
    _fields_ = [
        ("format", ctypes.c_char_p),
        ("name", ctypes.c_char_p),
        ("metadata", ctypes.c_char_p),
        ("flags", ctypes.c_int64),
        ("n_children", ctypes.c_int64),
        ("children", ctypes.POINTER(ctypes.c_void_p)),
        ("dictionary", ctypes.c_void_p),
        ("release", ctypes.c_void_p),
        ("private_data", ctypes.c_void_p),
    ]
class ArrowArray(ctypes.Structure):
    """The Array type from arrow C array."""


# The members are attached after the class statement because `children` and
# `dictionary` are pointers to `ArrowArray` itself, which requires the class to
# exist first. ctypes lays the members out in the listed order.
ArrowArray._fields_ = [  # pylint: disable=protected-access
    ("length", ctypes.c_int64),
    ("null_count", ctypes.c_int64),
    ("offset", ctypes.c_int64),
    ("n_buffers", ctypes.c_int64),
    ("n_children", ctypes.c_int64),
    ("buffers", ctypes.POINTER(ctypes.c_void_p)),
    ("children", ctypes.POINTER(ctypes.POINTER(ArrowArray))),
    ("dictionary", ctypes.POINTER(ArrowArray)),
    ("release", ctypes.c_void_p),
    ("private_data", ctypes.c_void_p),
]
class ArrowDeviceArray(ctypes.Structure):
    """The Array type from arrow C device array."""

    # The `ArrowArray` is embedded by value. `cudf_cat_inf` expects `device_type`
    # to be 2 (CUDA) and hands `sync_event` to `wait_event`.
    _fields_ = [
        ("array", ArrowArray),
        ("device_id", ctypes.c_int64),
        ("device_type", ctypes.c_int32),
        ("sync_event", ctypes.c_void_p),
        ("reserved", ctypes.c_int64 * 3),
    ]
# ctypes prototypes for the CPython capsule accessors. The result and argument
# types are declared explicitly instead of relying on the ctypes defaults, the
# pointer returned by `PyCapsule_GetPointer` is exposed as a `c_void_p`.
PyCapsule_GetName = ctypes.pythonapi.PyCapsule_GetName
PyCapsule_GetName.restype = ctypes.c_char_p
PyCapsule_GetName.argtypes = [ctypes.py_object]

PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer
PyCapsule_GetPointer.restype = ctypes.c_void_p
PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]
def wait_event(event_hdl: Optional[int]) -> None:
    """Wait for CUDA event exported by arrow.

    Parameters
    ----------
    event_hdl :
        Address of the event exported through the ``sync_event`` member of the
        arrow device array. A null handle (``None`` or 0) means there is nothing
        to synchronize on and the function returns immediately.

    Raises
    ------
    ValueError
        If the CUDA runtime reports an error while enqueueing the wait.
    """
    if not event_hdl:
        # ctypes represents a null `c_void_p` as None. Dereferencing it would raise
        # a "NULL pointer access" error, while a null event simply requires no
        # synchronization.
        return

    # cuda-python is a dependency of cuDF.
    from cuda.bindings import runtime as cudart

    # The handle points to the event, read the event value stored at the address.
    event = ctypes.cast(event_hdl, ctypes.POINTER(ctypes.c_int64))
    (status,) = cudart.cudaStreamWaitEvent(
        STREAM_PER_THREAD,
        event.contents.value,
        cudart.cudaEventWaitDefault,
    )
    if status != cudart.cudaError_t.cudaSuccess:
        _, msg = cudart.cudaGetErrorString(status)
        raise ValueError(msg)
def cudf_cat_inf(  # pylint: disable=too-many-locals
    cats: DfCatAccessor, codes: "pd.Series"
) -> Tuple[Union[CudaArrayInf, CudaStringArray], ArrayInf, Tuple]:
    """Obtain the cuda array interface for cuDF categories.

    Parameters
    ----------
    cats :
        The categories. Numeric categories are exported through their CUDA array
        interface, string categories through the arrow C device interface.
    codes :
        The category codes.

    Returns
    -------
    The interface of the categories, the interface of the codes, and a tuple of
    the objects that own the memory referenced by the interfaces.

    Raises
    ------
    TypeError
        If the categories are large strings, or neither numeric nor string.
    """
    cp = import_cupy()
    is_num_idx = cp.issubdtype(cats.dtype, cp.floating) or cp.issubdtype(
        cats.dtype, cp.integer
    )
    if is_num_idx:
        cats_ainf = cuda_array_interface_dict(cats)
        codes_ainf = cuda_array_interface_dict(codes)
        return cats_ainf, codes_ainf, (cats, codes)

    # pylint: disable=protected-access
    arrow_col = cats._column.to_pylibcudf(mode="read")
    # Tuple[types.CapsuleType, types.CapsuleType]
    schema, array = arrow_col.__arrow_c_device_array__()

    array_ptr = PyCapsule_GetPointer(array, PyCapsule_GetName(array))
    schema_ptr = PyCapsule_GetPointer(schema, PyCapsule_GetName(schema))

    # Cast to arrow array
    arrow_device_array = ctypes.cast(
        array_ptr, ctypes.POINTER(ArrowDeviceArray)
    ).contents
    # Make sure this is a CUDA array before treating the event as a CUDA event.
    assert arrow_device_array.device_type == 2  # 2 is CUDA
    # A null event is read back as None by ctypes, there is nothing to wait for.
    if arrow_device_array.sync_event is not None:
        wait_event(arrow_device_array.sync_event)

    arrow_array = arrow_device_array.array
    # Validate the layout before indexing into the buffers, `buffers` is a raw
    # pointer and ctypes performs no bounds check on it.
    assert arrow_array.n_children == 0
    assert arrow_array.n_buffers == 3
    assert arrow_array.offset == 0

    mask, offset, data = (
        arrow_array.buffers[0],
        arrow_array.buffers[1],
        arrow_array.buffers[2],
    )
    # Categories should not have missing values.
    assert mask is None

    # Cast to ArrowSchema
    arrow_schema = ctypes.cast(schema_ptr, ctypes.POINTER(ArrowSchema)).contents
    assert arrow_schema.format in (b"u", b"U", b"vu")  # utf8, large utf8
    # NOTE(review): b"vu" is handled like b"u" (32-bit offsets) here, confirm this
    # matches what the producer exports for that format.
    if arrow_schema.format in (b"u", b"vu"):
        joffset: CudaArrayInf = _arrow_buf_inf(
            offset, "<i4", arrow_array.length + 1, STREAM_PER_THREAD
        )
    elif arrow_schema.format == b"U":
        raise TypeError("Large string for category index (names) is not supported.")
    else:
        raise TypeError(
            "Unexpected type for category index. It's neither numeric nor string."
        )

    # 0 size for unknown
    jdata: CudaArrayInf = _arrow_buf_inf(data, "|i1", 0, STREAM_PER_THREAD)
    jnames: CudaStringArray = {
        "offsets": joffset,
        "values": jdata,
    }

    jcodes = cuda_array_interface_dict(codes)
    return jnames, jcodes, (arrow_col,)
class Categories:
    """An internal storage class for categories returned by the DMatrix and the
    Booster. This class is designed to be opaque. It is intended to be used exclusively
    by XGBoost as an intermediate storage for re-coding categorical data.

    The categories are saved along with the booster object. As a result, users don't
    need to preserve this class for re-coding. Use the booster model IO instead if you
    want to preserve the categories in a stable format.

    .. versionadded:: 3.1.0

    .. warning::

        This class is internal.

    .. code-block:: python

        Xy = xgboost.QuantileDMatrix(X, y, enable_categorical=True)
        booster = xgboost.train({}, Xy)

        categories = booster.get_categories() # Get categories

        # Use categories as a reference for re-coding
        Xy_new = xgboost.QuantileDMatrix(
            X_new, y_new, feature_types=categories, enable_categorical=True, ref=Xy
        )

        # Categories will be part of the `model.json`.
        booster.save_model("model.json")

    Parameters
    ----------
    handle :
        A pair of the native handle and the callable that releases it.
    arrow_arrays :
        The categories exported to arrow, if they were requested.
    """

    def __init__(
        self,
        handle: Tuple[ctypes.c_void_p, Callable[[], None]],
        arrow_arrays: Optional[ArrowCatList],
    ) -> None:
        # The handle type is a bundle of the handle and the free call. Otherwise, we
        # will have to import the lib and checkcall inside the __del__ method from the
        # core module to avoid cyclic model dependency. Importing modules in __del__ can
        # result in Python abort if __del__ is called during exception handling
        # (interpreter is shutting down).
        self._handle, self._free = handle
        self._arrow_arrays = arrow_arrays

    def to_arrow(self) -> ArrowCatList:
        """Get the categories in the dataset. The results are stored in a list of
        (feature name, arrow array) pairs, with one array for each categorical
        feature. If a feature is numerical, then the corresponding column in the list is
        None. A value error will be raised if this container was created without the
        `export_to_arrow` option.

        """
        if self._arrow_arrays is None:
            raise ValueError(
                "The `export_to_arrow` option of the `get_categories` method"
                " is required."
            )
        return self._arrow_arrays

    def empty(self) -> bool:
        """Returns True if there's no category."""
        return self._handle.value is None

    def get_handle(self) -> int:
        """Internal method for retrieving the handle."""
        assert self._handle.value
        return self._handle.value

    def __del__(self) -> None:
        # The finalizer also runs for an object whose `__init__` raised before the
        # attributes were assigned (e.g. a malformed `handle` argument). Looking the
        # handle up defensively avoids an AttributeError in that case.
        handle = getattr(self, "_handle", None)
        if handle is None or handle.value is None:
            return
        self._free()
def get_ref_categories(
    feature_types: Optional[Union[FeatureTypes, Categories]],
) -> Tuple[Optional[FeatureTypes], Optional[Categories]]:
    """Split the `feature_types` argument into feature types and reference
    categories. Various `DMatrix` reuse `feature_types` for specifying the reference
    categories.

    Returns
    -------
    ``(None, categories)`` when a :class:`Categories` object is passed in,
    ``(feature_types, None)`` otherwise.
    """
    if isinstance(feature_types, Categories):
        return None, feature_types
    return feature_types, None
# Type schema for storing JSON-encoded array interface. There is one entry for each
# column: a single interface for a numeric column, or a pair of (categories, codes)
# for a categorical column.
AifType: TypeAlias = List[
    Union[
        # numeric column
        Union[ArrayInf, CudaArrayInf],
        # categorical column
        Tuple[
            # (cuda) numeric index | (cuda) string index
            Union[ArrayInf, CudaArrayInf, StringArray, CudaStringArray],
            Union[ArrayInf, CudaArrayInf],  # codes
        ],
    ]
]
class TransformedDf(ABC):
"""Internal class for storing transformed dataframe.
Parameters
----------
ref_categories :
Optional reference categories used for re-coding.
aitfs :
Array interface for each column.
"""
temporary_buffers: List[Tuple] = []
def __init__(self, ref_categories: Optional[Categories], aitfs: AifType) -> None:
self.ref_categories = ref_categories
if ref_categories is not None and ref_categories.get_handle() is not None:
aif = ref_categories.get_handle()
self.ref_aif: Optional[int] = aif
else:
self.ref_aif = None
self.aitfs = aitfs
def array_interface(self) -> bytes:
"""Return a byte string for JSON encoded array interface."""
if self.ref_categories is not None:
ref_inf: dict = {"ref_categories": self.ref_aif, "columns": self.aitfs}
inf = bytes(json.dumps(ref_inf), "utf-8")
else:
inf = bytes(json.dumps(self.aitfs), "utf-8")
return inf
@property
@abstractmethod
def shape(self) -> Tuple[int, int]:
"""Return the shape of the dataframe."""