"""Helpers for interfacing array like objects.""" import copy import ctypes import json from abc import ABC, abstractmethod from functools import cache as fcache from typing import ( TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Protocol, Tuple, Type, TypeAlias, TypedDict, TypeGuard, Union, cast, overload, ) import numpy as np from ._typing import ( ArrowCatList, CNumericPtr, DataType, FeatureTypes, NumpyDType, NumpyOrCupy, ) from .compat import import_cupy, import_pyarrow, lazy_isinstance if TYPE_CHECKING: import pandas as pd import pyarrow as pa # Used for accepting inputs for numpy and cupy arrays class _ArrayLikeArg(Protocol): @property def __array_interface__(self) -> "ArrayInf": ... class _CudaArrayLikeArg(Protocol): @property def __cuda_array_interface__(self) -> "CudaArrayInf": ... ArrayInf = TypedDict( "ArrayInf", { "data": Tuple[int, bool], "typestr": str, "version": Literal[3], "strides": Optional[Tuple[int, ...]], "shape": Tuple[int, ...], "mask": Union["ArrayInf", None, _ArrayLikeArg], }, ) CudaArrayInf = TypedDict( "CudaArrayInf", { "data": Tuple[int, bool], "typestr": str, "version": Literal[3], "strides": Optional[Tuple[int, ...]], "shape": Tuple[int, ...], "mask": Union["ArrayInf", None, _ArrayLikeArg], "stream": int, }, ) StringArray = TypedDict("StringArray", {"offsets": ArrayInf, "values": ArrayInf}) CudaStringArray = TypedDict( "CudaStringArray", {"offsets": CudaArrayInf, "values": CudaArrayInf} ) def array_hasobject(data: DataType) -> bool: """Whether the numpy array has object dtype.""" return ( hasattr(data, "dtype") and hasattr(data.dtype, "hasobject") and data.dtype.hasobject ) def cuda_array_interface_dict(data: _CudaArrayLikeArg) -> CudaArrayInf: """Returns a dictionary storing the CUDA array interface.""" if array_hasobject(data): raise ValueError("Input data contains `object` dtype. Expecting numeric data.") ainf = data.__cuda_array_interface__ if "mask" in ainf: ainf["mask"] = ainf["mask"].__cuda_array_interface__ # type: ignore return ainf def cuda_array_interface(data: _CudaArrayLikeArg) -> bytes: """Make cuda array interface str.""" interface = cuda_array_interface_dict(data) interface_str = bytes(json.dumps(interface), "utf-8") return interface_str def from_array_interface(interface: ArrayInf, zero_copy: bool = False) -> NumpyOrCupy: """Convert array interface to numpy or cupy array""" class Array: """Wrapper type for communicating with numpy and cupy.""" _interface: Optional[ArrayInf] = None @property def __array_interface__(self) -> Optional[ArrayInf]: return self._interface @__array_interface__.setter def __array_interface__(self, interface: ArrayInf) -> None: self._interface = copy.copy(interface) # Convert some fields to tuple as required by numpy self._interface["shape"] = tuple(self._interface["shape"]) self._interface["data"] = ( self._interface["data"][0], self._interface["data"][1], ) strides = self._interface.get("strides", None) if strides is not None: self._interface["strides"] = tuple(strides) @property def __cuda_array_interface__(self) -> Optional[ArrayInf]: return self.__array_interface__ @__cuda_array_interface__.setter def __cuda_array_interface__(self, interface: ArrayInf) -> None: self.__array_interface__ = interface arr = Array() if "stream" in interface: # CUDA stream is presented, this is a __cuda_array_interface__. arr.__cuda_array_interface__ = interface out = import_cupy().array(arr, copy=not zero_copy) else: arr.__array_interface__ = interface out = np.array(arr, copy=not zero_copy) return out # Default constant value for CUDA per-thread stream. STREAM_PER_THREAD = 2 # Typing is not strict as there are subtle differences between CUDA array interface and # array interface. We handle them uniformly for now. def make_array_interface( ptr: Union[CNumericPtr, int], shape: Tuple[int, ...], dtype: Type[np.number], is_cuda: bool, ) -> ArrayInf: """Make an __(cuda)_array_interface__ from a pointer.""" # Use an empty array to handle typestr and descr if is_cuda: empty = import_cupy().empty(shape=(0,), dtype=dtype) array = empty.__cuda_array_interface__ # pylint: disable=no-member else: empty = np.empty(shape=(0,), dtype=dtype) array = empty.__array_interface__ # pylint: disable=no-member if not isinstance(ptr, int): addr = ctypes.cast(ptr, ctypes.c_void_p).value else: addr = ptr length = int(np.prod(shape)) # Handle empty dataset. assert addr is not None or length == 0 if addr is None: return array array["data"] = (addr, True) if is_cuda and "stream" not in array: array["stream"] = STREAM_PER_THREAD array["shape"] = shape array["strides"] = None return array def is_arrow_dict(data: Any) -> TypeGuard["pa.DictionaryArray"]: """Is this an arrow dictionary array?""" return lazy_isinstance(data, "pyarrow.lib", "DictionaryArray") class DfCatAccessor(Protocol): """Protocol for pandas cat accessor.""" @property def categories( # pylint: disable=missing-function-docstring self, ) -> "pd.Index": ... @property def codes(self) -> "pd.Series": ... # pylint: disable=missing-function-docstring @property def dtype(self) -> np.dtype: ... # pylint: disable=missing-function-docstring @property def values(self) -> np.ndarray: ... # pylint: disable=missing-function-docstring def to_arrow( # pylint: disable=missing-function-docstring self, ) -> Union["pa.StringArray", "pa.IntegerArray"]: ... @property def __cuda_array_interface__(self) -> CudaArrayInf: ... @property def _column(self) -> Any: ... def _is_df_cat(data: Any) -> TypeGuard[DfCatAccessor]: # Test pd.Series.cat, not pd.Series return hasattr(data, "categories") and hasattr(data, "codes") @fcache def _arrow_npdtype() -> Dict[Any, Type[np.number]]: import pyarrow as pa mapping: Dict[Any, Type[np.number]] = { pa.int8(): np.int8, pa.int16(): np.int16, pa.int32(): np.int32, pa.int64(): np.int64, pa.uint8(): np.uint8, pa.uint16(): np.uint16, pa.uint32(): np.uint32, pa.uint64(): np.uint64, pa.float16(): np.float16, pa.float32(): np.float32, pa.float64(): np.float64, } return mapping @overload def _arrow_buf_inf(address: int, typestr: str, size: int, stream: None) -> ArrayInf: ... @overload def _arrow_buf_inf( address: int, typestr: str, size: int, stream: int ) -> CudaArrayInf: ... def _arrow_buf_inf( address: int, typestr: str, size: int, stream: Optional[int] ) -> Union[ArrayInf, CudaArrayInf]: if stream is not None: jcuaif: CudaArrayInf = { "data": (address, True), "typestr": typestr, "version": 3, "strides": None, "shape": (size,), "mask": None, "stream": stream, } return jcuaif jaif: ArrayInf = { "data": (address, True), "typestr": typestr, "version": 3, "strides": None, "shape": (size,), "mask": None, } return jaif def _arrow_cat_names_inf(cats: "pa.StringArray") -> Tuple[StringArray, Any]: if not TYPE_CHECKING: pa = import_pyarrow() # FIXME(jiamingy): Account for offset, need to find an implementation that returns # offset > 0 assert cats.offset == 0 buffers: List[pa.Buffer] = cats.buffers() mask, offset, data = buffers assert offset.is_cpu off_len = len(cats) + 1 def get_n_bytes(typ: Type) -> int: return off_len * (np.iinfo(typ).bits // 8) if offset.size == get_n_bytes(np.int64): if not isinstance(cats, pa.LargeStringArray): arrow_str_error = "Expecting a `pyarrow.Array`." raise TypeError(arrow_str_error + f" Got: {type(cats)}.") # Convert to 32bit integer, arrow recommends against the use of i64. Also, # XGBoost cannot handle large number of categories (> 2**31). i32cats = cats.cast(pa.string()) mask, offset, data = i32cats.buffers() if offset.size != get_n_bytes(np.int32): raise TypeError( "Arrow dictionary type offsets is required to be 32-bit integer." ) joffset = _arrow_buf_inf(offset.address, " ArrayInf: """Helper for handling categorical codes.""" if not TYPE_CHECKING: pa = import_pyarrow() if not isinstance(array, pa.Array): # pylint: disable=E0606 raise TypeError(f"Invalid input type: {type(array)}") mask, data = array.buffers() jdata = make_array_interface( data.address, shape=(len(array),), dtype=_arrow_npdtype()[array.type], is_cuda=not data.is_cpu, ) if mask is not None: jmask: Optional[ArrayInf] = { "data": (mask.address, True), "typestr": " Tuple[StringArray, ArrayInf, Tuple]: """Get the array interface representation of a string-based category array.""" jnames, cats_tmp = _arrow_cat_names_inf(cats) jcodes = _arrow_array_inf(codes) return jnames, jcodes, (cats_tmp, None) def _ensure_np_dtype( data: DataType, dtype: Optional[NumpyDType] ) -> Tuple[np.ndarray, Optional[NumpyDType]]: """Ensure the np array has correct type and is contiguous.""" if array_hasobject(data) or data.dtype in [np.float16, np.bool_]: dtype = np.float32 data = data.astype(dtype, copy=False) if not data.flags.aligned: data = np.require(data, requirements="A") return data, dtype def array_interface_dict(data: np.ndarray) -> ArrayInf: """Returns an array interface from the input.""" if array_hasobject(data): raise ValueError("Input data contains `object` dtype. Expecting numeric data.") ainf = data.__array_interface__ if "mask" in ainf: ainf["mask"] = ainf["mask"].__array_interface__ return cast(ArrayInf, ainf) def pd_cat_inf( # pylint: disable=too-many-locals cats: DfCatAccessor, codes: "pd.Series" ) -> Tuple[Union[StringArray, ArrayInf], ArrayInf, Tuple]: """Get the array interface representation of pandas category accessor.""" # pandas uses -1 to represent missing values for categorical features codes = codes.replace(-1, np.nan) if np.issubdtype(cats.dtype, np.floating) or np.issubdtype(cats.dtype, np.integer): # Numeric index type name_values_num = cats.values jarr_values = array_interface_dict(name_values_num) code_values = codes.values jarr_codes = array_interface_dict(code_values) return jarr_values, jarr_codes, (name_values_num, code_values) def npstr_to_arrow_strarr(strarr: np.ndarray) -> Tuple[np.ndarray, str]: """Convert a numpy string array to an arrow string array.""" lenarr = np.vectorize(len) offsets = np.cumsum( np.concatenate([np.array([0], dtype=np.int64), lenarr(strarr)]) ) values = strarr.sum() assert "\0" not in values # arrow string array doesn't need null terminal return offsets.astype(np.int32), values # String index type name_offsets, name_values = npstr_to_arrow_strarr(cats.values) name_offsets, _ = _ensure_np_dtype(name_offsets, np.int32) joffsets = array_interface_dict(name_offsets) bvalues = name_values.encode("utf-8") ptr = ctypes.c_void_p.from_buffer(ctypes.c_char_p(bvalues)).value assert ptr is not None jvalues: ArrayInf = { "data": (ptr, True), "typestr": "|i1", "shape": (len(name_values),), "strides": None, "version": 3, "mask": None, } jnames: StringArray = {"offsets": joffsets, "values": jvalues} code_values = codes.values jcodes = array_interface_dict(code_values) buf = ( name_offsets, name_values, bvalues, code_values, ) # store temporary values return jnames, jcodes, buf def array_interface(data: np.ndarray) -> bytes: """Make array interface str.""" interface = array_interface_dict(data) interface_str = bytes(json.dumps(interface), "utf-8") return interface_str def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None: "Make sure no missing value in meta data." if ( "mask" in data.__cuda_array_interface__ and data.__cuda_array_interface__["mask"] is not None ): raise ValueError(f"Missing value is not allowed for: {field}") class ArrowSchema(ctypes.Structure): """The Schema type from arrow C array.""" _fields_ = [ ("format", ctypes.c_char_p), ("name", ctypes.c_char_p), ("metadata", ctypes.c_char_p), ("flags", ctypes.c_int64), ("n_children", ctypes.c_int64), ("children", ctypes.POINTER(ctypes.c_void_p)), ("dictionary", ctypes.c_void_p), ("release", ctypes.c_void_p), ("private_data", ctypes.c_void_p), ] class ArrowArray(ctypes.Structure): """The Array type from arrow C array.""" ArrowArray._fields_ = [ # pylint: disable=protected-access ("length", ctypes.c_int64), ("null_count", ctypes.c_int64), ("offset", ctypes.c_int64), ("n_buffers", ctypes.c_int64), ("n_children", ctypes.c_int64), ("buffers", ctypes.POINTER(ctypes.c_void_p)), ("children", ctypes.POINTER(ctypes.POINTER(ArrowArray))), ("dictionary", ctypes.POINTER(ArrowArray)), ("release", ctypes.c_void_p), ("private_data", ctypes.c_void_p), ] class ArrowDeviceArray(ctypes.Structure): """The Array type from arrow C device array.""" _fields_ = [ ("array", ArrowArray), ("device_id", ctypes.c_int64), ("device_type", ctypes.c_int32), ("sync_event", ctypes.c_void_p), ("reserved", ctypes.c_int64 * 3), ] PyCapsule_GetName = ctypes.pythonapi.PyCapsule_GetName PyCapsule_GetName.restype = ctypes.c_char_p PyCapsule_GetName.argtypes = [ctypes.py_object] PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer PyCapsule_GetPointer.restype = ctypes.c_void_p PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p] def wait_event(event_hdl: int) -> None: """Wait for CUDA event exported by arrow.""" # cuda-python is a dependency of cuDF. from cuda.bindings import runtime as cudart event = ctypes.cast(event_hdl, ctypes.POINTER(ctypes.c_int64)) (status,) = cudart.cudaStreamWaitEvent( STREAM_PER_THREAD, event.contents.value, cudart.cudaEventWaitDefault, ) if status != cudart.cudaError_t.cudaSuccess: _, msg = cudart.cudaGetErrorString(status) raise ValueError(msg) def cudf_cat_inf( # pylint: disable=too-many-locals cats: DfCatAccessor, codes: "pd.Series" ) -> Tuple[Union[CudaArrayInf, CudaStringArray], ArrayInf, Tuple]: """Obtain the cuda array interface for cuDF categories.""" cp = import_cupy() is_num_idx = cp.issubdtype(cats.dtype, cp.floating) or cp.issubdtype( cats.dtype, cp.integer ) if is_num_idx: cats_ainf = cuda_array_interface_dict(cats) codes_ainf = cuda_array_interface_dict(codes) return cats_ainf, codes_ainf, (cats, codes) # pylint: disable=protected-access arrow_col = cats._column.to_pylibcudf(mode="read") # Tuple[types.CapsuleType, types.CapsuleType] schema, array = arrow_col.__arrow_c_device_array__() array_ptr = PyCapsule_GetPointer(array, PyCapsule_GetName(array)) schema_ptr = PyCapsule_GetPointer(schema, PyCapsule_GetName(schema)) # Cast to arrow array arrow_device_array = ctypes.cast( array_ptr, ctypes.POINTER(ArrowDeviceArray) ).contents wait_event(arrow_device_array.sync_event) assert arrow_device_array.device_type == 2 # 2 is CUDA arrow_array = arrow_device_array.array mask, offset, data = ( arrow_array.buffers[0], arrow_array.buffers[1], arrow_array.buffers[2], ) # Categories should not have missing values. assert mask is None assert arrow_array.n_children == 0 assert arrow_array.n_buffers == 3 assert arrow_array.offset == 0 # Cast to ArrowSchema arrow_schema = ctypes.cast(schema_ptr, ctypes.POINTER(ArrowSchema)).contents assert arrow_schema.format in (b"u", b"U", b"vu") # utf8, large utf8 if arrow_schema.format in (b"u", b"vu"): joffset: CudaArrayInf = _arrow_buf_inf( offset, " None: # The handle type is a bundle of the handle and the free call. Otherwise, we # will have to import the lib and checkcall inside the __del__ method from the # core module to avoid cyclic model dependency. Importing modules in __del__ can # result in Python abort if __del__ is called during exception handling # (interpreter is shutting down). self._handle, self._free = handle self._arrow_arrays = arrow_arrays def to_arrow(self) -> ArrowCatList: """Get the categories in the dataset. The results are stored in a list of (feature name, arrow array) pairs, with one array for each categorical feature. If a feature is numerical, then the corresponding column in the list is None. A value error will be raised if this container was created without the `export_to_arrow` option. """ if self._arrow_arrays is None: raise ValueError( "The `export_to_arrow` option of the `get_categories` method" " is required." ) return self._arrow_arrays def empty(self) -> bool: """Returns True if there's no category.""" return self._handle.value is None def get_handle(self) -> int: """Internal method for retrieving the handle.""" assert self._handle.value return self._handle.value def __del__(self) -> None: if self._handle.value is None: return self._free() def get_ref_categories( feature_types: Optional[Union[FeatureTypes, Categories]], ) -> Tuple[Optional[FeatureTypes], Optional[Categories]]: """Get the optional reference categories from the `feature_types`. This is used by various `DMatrix` where the `feature_types` is reused for specifying the reference categories. """ if isinstance(feature_types, Categories): ref_categories = feature_types feature_types = None else: ref_categories = None return feature_types, ref_categories # Type schema for storing JSON-encoded array interface AifType: TypeAlias = List[ Union[ # numeric column Union[ArrayInf, CudaArrayInf], # categorical column Tuple[ # (cuda) numeric index | (cuda) string index Union[ArrayInf, CudaArrayInf, StringArray, CudaStringArray], Union[ArrayInf, CudaArrayInf], # codes ], ] ] class TransformedDf(ABC): """Internal class for storing transformed dataframe. Parameters ---------- ref_categories : Optional reference categories used for re-coding. aitfs : Array interface for each column. """ temporary_buffers: List[Tuple] = [] def __init__(self, ref_categories: Optional[Categories], aitfs: AifType) -> None: self.ref_categories = ref_categories if ref_categories is not None and ref_categories.get_handle() is not None: aif = ref_categories.get_handle() self.ref_aif: Optional[int] = aif else: self.ref_aif = None self.aitfs = aitfs def array_interface(self) -> bytes: """Return a byte string for JSON encoded array interface.""" if self.ref_categories is not None: ref_inf: dict = {"ref_categories": self.ref_aif, "columns": self.aitfs} inf = bytes(json.dumps(ref_inf), "utf-8") else: inf = bytes(json.dumps(self.aitfs), "utf-8") return inf @property @abstractmethod def shape(self) -> Tuple[int, int]: """Return the shape of the dataframe."""