# pylint: disable=invalid-name, too-many-lines
"""Utilities for data generation."""
import gc
import multiprocessing
import os
import string
import zipfile
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Generator,
    List,
    NamedTuple,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
)
from urllib import request

import numpy as np
import pytest
from numpy import typing as npt
from numpy.random import Generator as RNG
from scipy import sparse

from ..core import DataIter, DMatrix, QuantileDMatrix
from ..data import is_pd_cat_dtype, pandas_pyarrow_mapper
from ..sklearn import ArrayLike, XGBRanker
from ..training import train as train_fn

if TYPE_CHECKING:
    from pandas import DataFrame as DataFrameT
else:
    DataFrameT = Any

joblib = pytest.importorskip("joblib")
memory = joblib.Memory("./cachedir", verbose=0)


def np_dtypes(
    n_samples: int, n_features: int
) -> Generator[Union[Tuple[np.ndarray, np.ndarray], Tuple[list, list]], None, None]:
    """Enumerate all supported dtypes from numpy."""
    pd = pytest.importorskip("pandas")

    rng = np.random.RandomState(1994)
    # Integer and float.
    orig = rng.randint(low=0, high=127, size=n_samples * n_features).reshape(
        n_samples, n_features
    )
    dtypes = [
        np.int32,
        np.int64,
        np.byte,
        np.short,
        np.intc,
        np.int_,
        np.longlong,
        np.uint32,
        np.uint64,
        np.ubyte,
        np.ushort,
        np.uintc,
        np.uint,
        np.ulonglong,
        np.float16,
        np.float32,
        np.float64,
        np.half,
        np.single,
        np.double,
    ]
    for dtype in dtypes:
        X = np.array(orig, dtype=dtype)
        yield orig, X
        yield orig.tolist(), X.tolist()

    for dtype in dtypes:
        X = np.array(orig, dtype=dtype)
        df_orig = pd.DataFrame(orig)
        df = pd.DataFrame(X)
        yield df_orig, df

    # Boolean
    orig = rng.binomial(1, 0.5, size=n_samples * n_features).reshape(
        n_samples, n_features
    )
    for dtype1 in [np.bool_, bool]:
        X = np.array(orig, dtype=dtype1)
        yield orig, X

    for dtype2 in [np.bool_, bool]:
        X = np.array(orig, dtype=dtype2)
        df_orig = pd.DataFrame(orig)
        df = pd.DataFrame(X)
        yield df_orig, df
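

# Illustrative usage sketch (added for clarity; `_example_np_dtypes_roundtrip` is
# not one of the original helpers): every pair yielded by `np_dtypes` should
# describe the same values once both sides are viewed as float32.
def _example_np_dtypes_roundtrip() -> None:
    """Minimal sketch showing how the (baseline, converted) pairs can be consumed."""
    for orig, x in np_dtypes(n_samples=16, n_features=4):
        lhs = np.asarray(orig, dtype=np.float32)
        rhs = np.asarray(x, dtype=np.float32)
        np.testing.assert_allclose(lhs, rhs)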


def pd_dtypes() -> Generator:
    """Enumerate all supported pandas extension types."""
    pd = pytest.importorskip("pandas")

    # Integer
    dtypes = [
        pd.UInt8Dtype(),
        pd.UInt16Dtype(),
        pd.UInt32Dtype(),
        pd.UInt64Dtype(),
        pd.Int8Dtype(),
        pd.Int16Dtype(),
        pd.Int32Dtype(),
        pd.Int64Dtype(),
    ]

    Null: Union[float, None, Any] = np.nan
    orig = pd.DataFrame(
        {"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=np.float32
    )
    for Null in (np.nan, None, pd.NA):
        for dtype in dtypes:
            df = pd.DataFrame(
                {"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=dtype
            )
            yield orig, df

    # Float
    Null = np.nan
    dtypes = [pd.Float32Dtype(), pd.Float64Dtype()]
    orig = pd.DataFrame(
        {"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, dtype=np.float32
    )
    for Null in (np.nan, None, pd.NA):
        for dtype in dtypes:
            df = pd.DataFrame(
                {"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, dtype=dtype
            )
            yield orig, df
            ser_orig = orig["f0"]
            ser = df["f0"]
            assert isinstance(ser, pd.Series)
            assert isinstance(ser_orig, pd.Series)
            yield ser_orig, ser

    # Categorical
    orig = orig.astype("category")
    for c in orig.columns:
        orig[c] = orig[c].cat.rename_categories(int)
    for Null in (np.nan, None, pd.NA):
        df = pd.DataFrame(
            {"f0": [1, 2, Null, 3], "f1": [3, 2, Null, 1]},
            dtype=pd.CategoricalDtype(),
        )
        yield orig, df

    # Boolean
    for Null in [None, pd.NA]:
        data = {"f0": [True, False, Null, True], "f1": [False, True, Null, True]}
        # pd.NA is not convertible to bool.
        orig = pd.DataFrame(data, dtype=np.bool_ if Null is None else pd.BooleanDtype())
        df = pd.DataFrame(data, dtype=pd.BooleanDtype())
        yield orig, df


def pd_arrow_dtypes() -> Generator:
    """Pandas DataFrame with pyarrow backed type."""
    pd = pytest.importorskip("pandas")
    pa = pytest.importorskip("pyarrow")

    # Integer
    dtypes = pandas_pyarrow_mapper
    # Create a dictionary-backed dataframe; enable this when the roundtrip is
    # implemented in pandas/pyarrow
    #
    # category = pd.ArrowDtype(pa.dictionary(pa.int32(), pa.int32(), ordered=True))
    # df = pd.DataFrame({"f0": [0, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=category)

    # Error:
    # >>> df.astype("category")
    # Function 'dictionary_encode' has no kernel matching input types
    # (array[dictionary<values=int32, indices=int32, ordered=0>])

    # Error:
    # pd_cat_df = pd.DataFrame(
    #     {"f0": [0, 2, Null, 3], "f1": [4, 3, Null, 1]},
    #     dtype="category"
    # )
    # pa_catcodes = (
    #     df["f1"].array.__arrow_array__().combine_chunks().to_pandas().cat.codes
    # )
    # pd_catcodes = pd_cat_df["f1"].cat.codes
    # assert pd_catcodes.equals(pa_catcodes)

    for Null in (None, pd.NA, 0):
        for dtype in dtypes:
            if dtype.startswith("float16") or dtype.startswith("bool"):
                continue
            # Use np.nan as the baseline
            orig_null = Null if not pd.isna(Null) and Null == 0 else np.nan
            orig = pd.DataFrame(
                {"f0": [1, 2, orig_null, 3], "f1": [4, 3, orig_null, 1]},
                dtype=np.float32,
            )

            df = pd.DataFrame(
                {"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=dtype
            )
            yield orig, df

    # If Null is `False`, then there's no missing value.
    for Null in (pd.NA, False):
        orig = pd.DataFrame(
            {"f0": [True, False, Null, True], "f1": [False, True, Null, True]},
            dtype=pd.BooleanDtype(),
        )
        df = pd.DataFrame(
            {"f0": [True, False, Null, True], "f1": [False, True, Null, True]},
            dtype=pd.ArrowDtype(pa.bool_()),
        )
        yield orig, df


def check_inf(rng: RNG) -> None:
    """Validate that input containing `inf` in X is rejected."""
    X = rng.random(size=32).reshape(8, 4)
    y = rng.random(size=8)
    X[5, 2] = np.inf

    with pytest.raises(ValueError, match="Input data contains `inf`"):
        QuantileDMatrix(X, y)

    with pytest.raises(ValueError, match="Input data contains `inf`"):
        DMatrix(X, y)


@memory.cache
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
    """Synthesize a dataset similar to the sklearn California housing dataset.

    The real one can be obtained via:

    .. code-block::

        import sklearn.datasets

        X, y = sklearn.datasets.fetch_california_housing(return_X_y=True)

    """
    n_samples = 20640
    rng = np.random.default_rng(2025)

    pd = pytest.importorskip("pandas")

    def mixture_2comp(
        means: List[float], sigmas: List[float], weights: List[float]
    ) -> np.ndarray:
        l0 = rng.normal(
            size=(int(n_samples * weights[0])), loc=means[0], scale=sigmas[0]
        )
        l1 = rng.normal(size=(n_samples - l0.shape[0]), loc=means[1], scale=sigmas[1])
        return np.concatenate([l0, l1], axis=0)

    def norm(mean: float, std: float) -> np.ndarray:
        return rng.normal(loc=mean, scale=std, size=(n_samples,))

    df = pd.DataFrame(
        {
            "Longitude": mixture_2comp(
                [-118.0703597, -121.85682825],
                [0.7897320650373969, 0.7248398629412008],
                [0.60402556, 0.39597444],
            ),
            "Latitude": mixture_2comp(
                [37.84266317, 33.86030848],
                [1.0643911549736087, 0.5049274656834589],
                [0.44485062, 0.55514938],
            ),
            "MedInc": norm(mean=3.8706710029069766, std=1.8997756945748738),
            "HouseAge": norm(mean=28.639486434108527, std=12.585252725724606),
            "AveRooms": norm(mean=5.428999742190376, std=2.474113202333516),
            "AveBedrms": norm(mean=1.096675149606208, std=0.47389937625774475),
            "Population": norm(mean=1425.4767441860465, std=1132.434687757615),
            "AveOccup": norm(mean=3.0706551594363742, std=10.385797959128219),
            "MedHouseVal": norm(mean=2.068558169089147, std=1.1539282040412253),
        }
    )
    X = df[df.columns.difference(["MedHouseVal"])].to_numpy()
    y = df["MedHouseVal"].to_numpy()
    return X, y
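

# Illustrative usage sketch (added for clarity, not an original helper): the
# synthetic housing data can stand in for the real dataset in a quick
# regression smoke test.
def _example_california_housing_usage() -> None:
    """Train a small booster on the synthetic California housing data."""
    X, y = get_california_housing()
    Xy = DMatrix(X, y)
    train_fn({"objective": "reg:squarederror", "max_depth": 3}, Xy, num_boost_round=4)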


@memory.cache
def get_digits() -> Tuple[np.ndarray, np.ndarray]:
    """Fetch the digits dataset from sklearn."""
    datasets = pytest.importorskip("sklearn.datasets")
    data = datasets.load_digits()
    return data.data, data.target


@memory.cache
def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
    """Fetch the breast cancer dataset from sklearn."""
    datasets = pytest.importorskip("sklearn.datasets")
    return datasets.load_breast_cancer(return_X_y=True)


@memory.cache
def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
    """Generate a sparse dataset."""
    datasets = pytest.importorskip("sklearn.datasets")
    rng = np.random.RandomState(199)
    n = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n, random_state=rng)
    flag = rng.binomial(1, sparsity, X.shape)
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if flag[i, j]:
                X[i, j] = np.nan
    return X, y


# pylint: disable=too-many-statements
@memory.cache
def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
    """Get a synthetic version of the Ames housing dataset.

    The real one can be obtained via:

    .. code-block::

        from sklearn import datasets

        datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

    Number of samples: 1460
    Number of features: 20
    Number of categorical features: 10
    Number of numerical features: 10
    """
    if TYPE_CHECKING:
        import pandas as pd
    else:
        pd = pytest.importorskip("pandas")

    rng = np.random.default_rng(1994)
    n_samples = 1460
    df = pd.DataFrame()

    def synth_cat(
        name_proba: Dict[Union[str, float], float], density: float
    ) -> pd.Series:
        n_nulls = int(n_samples * (1 - density))
        has_nan = np.abs(1.0 - density) > 1e-6 and n_nulls > 0
        if has_nan:
            sparsity = 1.0 - density
            name_proba[np.nan] = sparsity

        keys = list(name_proba.keys())
        p = list(name_proba.values())
        p[-1] += 1.0 - np.sum(p)  # Fix floating point error
        x = rng.choice(keys, size=n_samples, p=p)

        series = pd.Series(
            x,
            dtype=pd.CategoricalDtype(
                # not NA
                filter(lambda x: isinstance(x, str), keys)
            ),
        )
        return series

    df["BldgType"] = synth_cat(
        {
            "1Fam": 0.835616,
            "2fmCon": 0.078082,
            "Duplex": 0.035616,
            "Twnhs": 0.029452,
            "TwnhsE": 0.021233,
        },
        1.0,
    )
    df["GarageFinish"] = synth_cat(
        {"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
    )
    df["LotConfig"] = synth_cat(
        {
            "Corner": 0.180137,
            "CulDSac": 0.064384,
            "FR2": 0.032192,
            "FR3": 0.002740,
        },
        1.0,
    )
    df["Functional"] = synth_cat(
        {
            "Typ": 0.931506,
            "Min2": 0.023287,
            "Min1": 0.021232,
            "Mod": 0.010273,
            "Maj1": 0.009589,
            "Maj2": 0.003424,
            "Sev": 0.000684,
        },
        1.0,
    )
    df["MasVnrType"] = synth_cat(
        {
            "None": 0.591780,
            "BrkFace": 0.304794,
            "Stone": 0.087671,
            "BrkCmn": 0.010273,
        },
        0.99452,
    )
    df["HouseStyle"] = synth_cat(
        {
            "1Story": 0.497260,
            "2Story": 0.304794,
            "1.5Fin": 0.105479,
            "SLvl": 0.044520,
            "SFoyer": 0.025342,
            "1.5Unf": 0.009589,
            "2.5Unf": 0.007534,
            "2.5Fin": 0.005479,
        },
        1.0,
    )
    df["FireplaceQu"] = synth_cat(
        {
            "Gd": 0.260273,
            "TA": 0.214383,
            "Fa": 0.022602,
            "Ex": 0.016438,
            "Po": 0.013698,
        },
        0.527397,
    )
    df["ExterCond"] = synth_cat(
        {
            "TA": 0.878082,
            "Gd": 0.1,
            "Fa": 0.019178,
            "Ex": 0.002054,
            "Po": 0.000684,
        },
        1.0,
    )
    df["ExterQual"] = synth_cat(
        {
            "TA": 0.620547,
            "Gd": 0.334246,
            "Ex": 0.035616,
            "Fa": 0.009589,
        },
        1.0,
    )
    df["PoolQC"] = synth_cat(
        {
            "Gd": 0.002054,
            "Ex": 0.001369,
            "Fa": 0.001369,
        },
        0.004794,
    )

    # We focus on the categorical values here; for numerical features, a simple
    # normal distribution is used, which doesn't match the original data.
    def synth_num(loc: float, std: float, density: float) -> pd.Series:
        x = rng.normal(loc=loc, scale=std, size=n_samples)
        n_nulls = int(n_samples * (1 - density))
        if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
            null_idx = rng.choice(n_samples, size=n_nulls, replace=False)
            x[null_idx] = np.nan
        return pd.Series(x, dtype=np.float64)

    df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
    df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
    df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
    df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
    df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
    df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
    df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
    df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
    df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
    df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)

    columns = list(df.columns)
    rng.shuffle(columns)
    df = df[columns]

    # Linear interaction for testing purposes.
    y = np.zeros(shape=(n_samples,))
    for c in df.columns:
        if isinstance(df[c].dtype, pd.CategoricalDtype):
            y += df[c].cat.codes.astype(np.float64)
        else:
            y += df[c].values

    # Shift and scale to match the original y.
    y *= 79442.50288288662 / y.std()
    y += 180921.19589041095 - y.mean()

    return df, y


@memory.cache
def get_mq2008(
    dpath: str,
) -> Tuple[
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
]:
    """Fetch the MQ2008 dataset."""
    datasets = pytest.importorskip("sklearn.datasets")
    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
    target = os.path.join(dpath, "MQ2008.zip")
    if not os.path.exists(target):
        request.urlretrieve(url=src, filename=target)

    with zipfile.ZipFile(target, "r") as f:
        f.extractall(path=dpath)

    (
        x_train,
        y_train,
        qid_train,
        x_test,
        y_test,
        qid_test,
        x_valid,
        y_valid,
        qid_valid,
    ) = datasets.load_svmlight_files(
        (
            os.path.join(dpath, "MQ2008/Fold1/train.txt"),
            os.path.join(dpath, "MQ2008/Fold1/test.txt"),
            os.path.join(dpath, "MQ2008/Fold1/vali.txt"),
        ),
        query_id=True,
        zero_based=False,
    )

    return (
        x_train,
        y_train,
        qid_train,
        x_test,
        y_test,
        qid_test,
        x_valid,
        y_valid,
        qid_valid,
    )


def make_batches(  # pylint: disable=too-many-arguments,too-many-locals
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    use_cupy: bool = False,
    *,
    vary_size: bool = False,
    random_state: int = 1994,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    """Make batches of dense data."""
    X = []
    y = []
    w = []
    if use_cupy:
        import cupy

        rng = cupy.random.RandomState(np.uint64(random_state))
    else:
        rng = np.random.RandomState(random_state)
    for i in range(n_batches):
        n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
        _X = rng.randn(n_samples, n_features)
        _y = rng.randn(n_samples)
        _w = rng.uniform(low=0, high=1, size=n_samples)
        X.append(_X)
        y.append(_y)
        w.append(_w)
    return X, y, w
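

# Illustrative usage sketch (added for clarity, not an original helper): batches
# from `make_batches` are typically wrapped in `IteratorForTest` (defined later
# in this module) to exercise the streaming DMatrix interfaces.
def _example_batches_to_quantile_dmatrix() -> None:
    """Build a QuantileDMatrix from generated batches."""
    X, y, w = make_batches(n_samples_per_batch=32, n_features=4, n_batches=3)
    it = IteratorForTest(X, y, w, cache=None)
    Xy = QuantileDMatrix(it)
    assert Xy.num_row() == 32 * 3
    assert Xy.num_col() == 4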


RelData = Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]]


@dataclass
class ClickFold:
    """A structure containing information about generated user-click data."""

    X: sparse.csr_matrix
    y: npt.NDArray[np.int32]
    qid: npt.NDArray[np.int32]
    score: npt.NDArray[np.float32]
    click: npt.NDArray[np.int32]
    pos: npt.NDArray[np.int64]


class RelDataCV(NamedTuple):
    """Simple data struct for holding a train-test split of a learning to rank dataset."""

    train: RelData
    test: RelData
    max_rel: int

    def is_binary(self) -> bool:
        """Whether the label consists of binary relevance degree."""
        return self.max_rel == 1


class PBM:
    """Simulate click data with the position bias model. There are other models
    available in `ULTRA <https://github.com/ULTR-Community/ULTRA.git>`_ like the
    cascading model.

    References
    ----------
    Unbiased LambdaMART: An Unbiased Pairwise Learning-to-Rank Algorithm

    """

    def __init__(self, eta: float) -> None:
        # Click probability for each relevance degree (0 to 4).
        self.click_prob = np.array([0.1, 0.16, 0.28, 0.52, 1.0])
        exam_prob = np.array(
            [0.68, 0.61, 0.48, 0.34, 0.28, 0.20, 0.11, 0.10, 0.08, 0.06]
        )
        # Observation probability, encoding positional bias for each position.
        self.exam_prob = np.power(exam_prob, eta)

    def sample_clicks_for_query(
        self, labels: npt.NDArray[np.int32], position: npt.NDArray[np.int64]
    ) -> npt.NDArray[np.int32]:
        """Sample clicks for one query based on input relevance degree and position.

        Parameters
        ----------

        labels :
            relevance_degree

        """
        labels = np.array(labels, copy=True)

        click_prob = np.zeros(labels.shape)
        # minimum
        labels[labels < 0] = 0
        # maximum
        labels[labels >= len(self.click_prob)] = -1
        click_prob = self.click_prob[labels]

        exam_prob = np.zeros(labels.shape)
        assert position.size == labels.size
        ranks = np.array(position, copy=True)
        # maximum
        ranks[ranks >= self.exam_prob.size] = -1
        exam_prob = self.exam_prob[ranks]

        rng = np.random.default_rng(1994)
        prob = rng.random(size=labels.shape[0], dtype=np.float32)

        clicks: npt.NDArray[np.int32] = np.zeros(labels.shape, dtype=np.int32)
        clicks[prob < exam_prob * click_prob] = 1
        return clicks
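

# Illustrative usage sketch (added for clarity, not an original helper): sample
# clicks for a single toy query. Higher relevance and better (smaller) positions
# make a click more likely.
def _example_pbm_clicks() -> None:
    """Sample clicks for a toy ranked list with the position bias model."""
    pbm = PBM(eta=1.0)
    labels = np.array([3, 2, 1, 0], dtype=np.int32)  # relevance degrees
    position = np.arange(4, dtype=np.int64)  # ranks 0 (top) to 3 (bottom)
    clicks = pbm.sample_clicks_for_query(labels, position)
    assert clicks.shape == labels.shape
    assert set(np.unique(clicks)).issubset({0, 1})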


def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.NDArray]:
    """Run length encoding using numpy, modified from:
    https://gist.github.com/nvictus/66627b580c13068589957d6ab0919e66

    """
    x = np.asarray(x)
    n = x.size
    starts = np.r_[0, np.flatnonzero(~np.isclose(x[1:], x[:-1], equal_nan=True)) + 1]
    lengths = np.diff(np.r_[starts, n])
    values = x[starts]
    indptr = np.append(starts, np.array([x.size]))

    return indptr, lengths, values
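

# Illustrative worked example (added for clarity, not an original helper): for a
# sorted query-id array, the run starts returned by `rlencode` act as group
# boundaries.
def _example_rlencode() -> None:
    """Show the (indptr, lengths, values) triple returned by :py:func:`rlencode`."""
    qid = np.array([0, 0, 0, 1, 1, 2], dtype=np.int32)
    indptr, lengths, values = rlencode(qid)
    np.testing.assert_array_equal(indptr, [0, 3, 5, 6])
    np.testing.assert_array_equal(lengths, [3, 2, 1])
    np.testing.assert_array_equal(values, [0, 1, 2])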


def init_rank_score(
    X: sparse.csr_matrix,
    y: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
    sample_rate: float = 0.1,
) -> npt.NDArray[np.float32]:
    """We use XGBoost to generate the initial score instead of SVMRank for
    simplicity. Sample rate is set to 0.1 by default so that we can test with small
    datasets.

    """
    # random sample
    rng = np.random.default_rng(1994)
    n_samples = int(X.shape[0] * sample_rate)
    index: npt.NDArray = np.arange(0, X.shape[0], dtype=np.uint64)
    rng.shuffle(index)
    index = index[:n_samples]

    X_train = X[index]
    y_train = y[index]
    qid_train = qid[index]

    # Sort training data based on query id, required by XGBoost.
    sorted_idx = np.argsort(qid_train)
    X_train = X_train[sorted_idx]
    y_train = y_train[sorted_idx]
    qid_train = qid_train[sorted_idx]

    ltr = XGBRanker(objective="rank:ndcg", tree_method="hist")
    ltr.fit(X_train, y_train, qid=qid_train)

    # Use the original order of the data.
    scores = ltr.predict(X)
    return scores


def simulate_one_fold(
    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
    scores_fold: npt.NDArray[np.float32],
) -> ClickFold:
    """Simulate clicks for one fold."""
    X_fold, y_fold, qid_fold = fold
    assert qid_fold.dtype == np.int32

    qids = np.unique(qid_fold)

    position = np.empty((y_fold.size,), dtype=np.int64)
    clicks = np.empty((y_fold.size,), dtype=np.int32)
    pbm = PBM(eta=1.0)

    # Avoid grouping by qid as we want to preserve the original data partition by
    # the dataset authors.
    for q in qids:
        qid_mask = q == qid_fold
        qid_mask = qid_mask.reshape(qid_mask.shape[0])
        query_scores = scores_fold[qid_mask]
        # Initial rank list, scores sorted in decreasing order.
        query_position = np.argsort(query_scores)[::-1]
        position[qid_mask] = query_position
        # get labels
        relevance_degrees = y_fold[qid_mask]
        query_clicks = pbm.sample_clicks_for_query(relevance_degrees, query_position)
        clicks[qid_mask] = query_clicks

    assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
    assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)

    return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)


# pylint: disable=too-many-locals
def simulate_clicks(cv_data: RelDataCV) -> Tuple[ClickFold, Optional[ClickFold]]:
    """Simulate click data using the position biased model (PBM)."""
    X, y, qid = list(zip(cv_data.train, cv_data.test))

    # ptr to train-test split
    indptr = np.array([0] + [v.shape[0] for v in X])
    indptr = np.cumsum(indptr)

    assert len(indptr) == 2 + 1  # train, test
    X_full = sparse.vstack(X)
    y_full = np.concatenate(y)
    qid_full = np.concatenate(qid)

    # Obtain initial relevance score for click simulation
    scores_full = init_rank_score(X_full, y_full, qid_full)
    # partition it back to (train, test) tuple
    scores = [scores_full[indptr[i - 1] : indptr[i]] for i in range(1, indptr.size)]

    X_lst, y_lst, q_lst, s_lst, c_lst, p_lst = [], [], [], [], [], []
    for i in range(indptr.size - 1):
        fold = simulate_one_fold((X[i], y[i], qid[i]), scores[i])
        X_lst.append(fold.X)
        y_lst.append(fold.y)
        q_lst.append(fold.qid)
        s_lst.append(fold.score)
        c_lst.append(fold.click)
        p_lst.append(fold.pos)

    scores_check_1 = [s_lst[i] for i in range(indptr.size - 1)]
    for i in range(2):
        assert (scores_check_1[i] == scores[i]).all()

    if len(X_lst) == 1:
        train = ClickFold(X_lst[0], y_lst[0], q_lst[0], s_lst[0], c_lst[0], p_lst[0])
        test = None
    else:
        train, test = (
            ClickFold(X_lst[i], y_lst[i], q_lst[i], s_lst[i], c_lst[i], p_lst[i])
            for i in range(len(X_lst))
        )
    return train, test
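

# Illustrative end-to-end sketch (added for clarity, not an original helper). It
# assumes the MQ2008 data can be downloaded into `dpath` and that the labels and
# query ids are cast to int32 as expected by `simulate_one_fold`; MQ2008 uses
# relevance degrees 0-2, hence max_rel=2. Note that `init_rank_score` trains an
# XGBRanker internally, so this is not instantaneous.
def _example_click_simulation(dpath: str) -> None:
    """Simulate clicks on MQ2008 and sort the result for training."""
    x_train, y_train, q_train, x_test, y_test, q_test, _, _, _ = get_mq2008(dpath)
    cv = RelDataCV(
        train=(x_train, y_train.astype(np.int32), q_train.astype(np.int32)),
        test=(x_test, y_test.astype(np.int32), q_test.astype(np.int32)),
        max_rel=2,
    )
    train, _ = simulate_clicks(cv)
    X, clicks, y, qid = sort_ltr_samples(
        train.X, train.y, train.qid, train.click, train.pos
    )
    assert X.shape[0] == y.shape[0] == qid.shape[0] == clicks.shape[0]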


def sort_ltr_samples(
    X: sparse.csr_matrix,
    y: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
    clicks: npt.NDArray[np.int32],
    pos: npt.NDArray[np.int64],
) -> Tuple[
    sparse.csr_matrix,
    npt.NDArray[np.int32],
    npt.NDArray[np.int32],
    npt.NDArray[np.int32],
]:
    """Sort data based on query index and position."""
    sorted_idx = np.argsort(qid)
    X = X[sorted_idx]
    clicks = clicks[sorted_idx]
    qid = qid[sorted_idx]
    pos = pos[sorted_idx]

    indptr, _, _ = rlencode(qid)

    for i in range(1, indptr.size):
        beg = indptr[i - 1]
        end = indptr[i]

        assert beg < end, (beg, end)
        assert np.unique(qid[beg:end]).size == 1, (beg, end)

        query_pos = pos[beg:end]
        assert query_pos.min() == 0, query_pos.min()
        assert query_pos.max() >= query_pos.size - 1, (
            query_pos.max(),
            query_pos.size,
            i,
            np.unique(qid[beg:end]),
        )
        sorted_idx = np.argsort(query_pos)

        X[beg:end] = X[beg:end][sorted_idx]
        clicks[beg:end] = clicks[beg:end][sorted_idx]
        y[beg:end] = y[beg:end][sorted_idx]
        # not necessary
        qid[beg:end] = qid[beg:end][sorted_idx]

    data = X, clicks, y, qid

    return data


def run_base_margin_info(DType: Callable, DMatrixT: Type[DMatrix], device: str) -> None:
    """Run tests for base margin."""
    rng = np.random.default_rng()
    X = DType(rng.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2))
    if hasattr(X, "iloc"):
        y = X.iloc[:, 0]
    else:
        y = X[:, 0]
    base_margin = X
    # no error at set
    Xy = DMatrixT(X, y, base_margin=base_margin)
    # Error at train, caused by check in predictor.
    with pytest.raises(ValueError, match=r".*base_margin.*"):
        train_fn({"tree_method": "hist", "device": device}, Xy)

    if not hasattr(X, "iloc"):
        # column major matrix
        got = DType(Xy.get_base_margin().reshape(50, 2))
        assert (got == base_margin).all()

        assert base_margin.T.flags.c_contiguous is False
        assert base_margin.T.flags.f_contiguous is True
        Xy.set_info(base_margin=base_margin.T)
        got = DType(Xy.get_base_margin().reshape(2, 50))
        assert (got == base_margin.T).all()

    # Row vs col vec.
    base_margin = y
    Xy.set_base_margin(base_margin)
    bm_col = Xy.get_base_margin()
    Xy.set_base_margin(base_margin.reshape(1, base_margin.size))
    bm_row = Xy.get_base_margin()
    assert (bm_row == bm_col).all()

    # type
    base_margin = base_margin.astype(np.float64)
    Xy.set_base_margin(base_margin)
    bm_f64 = Xy.get_base_margin()
    assert (bm_f64 == bm_col).all()

    # too many dimensions
    base_margin = X.reshape(2, 5, 2, 5)
    with pytest.raises(ValueError, match=r".*base_margin.*"):
        Xy.set_base_margin(base_margin)


# pylint: disable=too-many-locals
@memory.cache
def make_sparse_regression(
    n_samples: int, n_features: int, sparsity: float, as_dense: bool
) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
    """Make a sparse regression dataset.

    Parameters
    ----------

    as_dense:

        Return the matrix as np.ndarray with missing values filled by NaN

    """
    if not hasattr(np.random, "default_rng"):
        rng = np.random.RandomState(1994)
        X = sparse.random(
            m=n_samples,
            n=n_features,
            density=1.0 - sparsity,
            random_state=rng,
            format="csr",
        )
        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
        return X, y

    # Use multiple threads to speed up the generation; convenient if you use this
    # function for benchmarking.
    n_threads = min(multiprocessing.cpu_count(), n_features)

    def random_csc(t_id: int) -> sparse.csc_matrix:
        rng = np.random.default_rng(1994 * t_id)
        thread_size = n_features // n_threads
        if t_id == n_threads - 1:
            n_features_tloc = n_features - t_id * thread_size
        else:
            n_features_tloc = thread_size

        X = sparse.random(
            m=n_samples,
            n=n_features_tloc,
            density=1.0 - sparsity,
            random_state=rng,
        ).tocsc()
        y = np.zeros((n_samples, 1))

        for i in range(X.shape[1]):
            size = X.indptr[i + 1] - X.indptr[i]
            if size != 0:
                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2

        return X, y

    futures = []
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        for i in range(n_threads):
            futures.append(executor.submit(random_csc, i))

    X_results = []
    y_results = []
    for f in futures:
        X, y = f.result()
        X_results.append(X)
        y_results.append(y)

    assert len(y_results) == n_threads

    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
    y = np.asarray(y_results)
    y = y.reshape((y.shape[0], y.shape[1])).T
    y = np.sum(y, axis=1)

    assert csr.shape[0] == n_samples
    assert csr.shape[1] == n_features
    assert y.shape[0] == n_samples

    if as_dense:
        arr = csr.toarray()
        assert arr.shape[0] == n_samples
        assert arr.shape[1] == n_features
        arr[arr == 0] = np.nan
        return arr, y

    return csr, y
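

# Illustrative usage sketch (added for clarity, not an original helper): the same
# generator yields either a CSR matrix or, with as_dense=True, a dense array
# where the removed entries are NaN.
def _example_sparse_regression_usage() -> None:
    """Generate a small sparse regression problem and check its shape."""
    X, y = make_sparse_regression(256, 16, sparsity=0.8, as_dense=False)
    assert X.shape == (256, 16)
    assert y.shape == (256,)
    # Roughly 20% of the entries are stored, the rest are treated as missing.
    assert 0 < X.nnz < 256 * 16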


def unique_random_strings(n_strings: int, seed: int) -> List[str]:
    """Generate n unique strings."""
    name_len = 8  # hardcoded, should be more than enough
    unique_strings: Set[str] = set()
    rng = np.random.default_rng(seed)

    while len(unique_strings) < n_strings:
        random_str = "".join(
            rng.choice(list(string.ascii_letters), size=name_len, replace=True)
        )
        unique_strings.add(random_str)

    return list(unique_strings)


# pylint: disable=too-many-arguments,too-many-locals,too-many-branches
def make_categorical(
    n_samples: int,
    n_features: int,
    n_categories: int,
    *,
    onehot: bool,
    sparsity: float = 0.0,
    cat_ratio: float = 1.0,
    shuffle: bool = False,
    random_state: int = 1994,
    cat_dtype: np.typing.DTypeLike = np.int64,
    device: str = "cpu",
) -> Tuple[ArrayLike, np.ndarray]:
    """Generate categorical features for test.

    Parameters
    ----------
    n_categories:
        Number of categories for categorical features.
    onehot:
        Should we apply one-hot encoding to the data?
    sparsity:
        The ratio of the amount of missing values over the number of all entries.
    cat_ratio:
        The ratio of features that are categorical.
    shuffle:
        Whether we should shuffle the columns.
    cat_dtype :
        The dtype for categorical features, might be string or numeric.

    Returns
    -------
    X, y
    """
    pd = pytest.importorskip("pandas")

    # Use different rngs for columns and rows so that `n_samples` can be changed
    # without changing the column types.
    rng = np.random.RandomState(random_state)
    row_rng = np.random.RandomState(random_state + 1)

    df = pd.DataFrame()
    for i in range(n_features):
        choice = rng.binomial(1, cat_ratio, size=1)[0]
        if choice == 1:
            if np.issubdtype(cat_dtype, np.str_):
                # We rely on using the feature index as the seed to generate the same
                # categories for multiple calls to `make_categorical`.
                categories = np.array(unique_random_strings(n_categories, i))
                c = row_rng.choice(categories, size=n_samples, replace=True)
            else:
                categories = np.arange(0, n_categories)
                c = row_rng.randint(low=0, high=n_categories, size=n_samples)

            df[str(i)] = pd.Series(c, dtype="category")
            df[str(i)] = df[str(i)].cat.set_categories(categories)
        else:
            num = row_rng.randint(low=0, high=n_categories, size=n_samples)
            df[str(i)] = pd.Series(num, dtype=num.dtype)

    label = np.zeros(shape=(n_samples,))
    for col in df.columns:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            label += df[col].cat.codes
        else:
            label += df[col]
    label += 1

    if sparsity > 0.0:
        for i in range(n_features):
            index = row_rng.randint(
                low=0, high=n_samples - 1, size=int(n_samples * sparsity)
            )
            df.iloc[index, i] = np.nan
            if is_pd_cat_dtype(df.dtypes.iloc[i]):
                assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

    assert df.shape[1] == n_features
    if onehot:
        df = pd.get_dummies(df)

    if shuffle:
        columns = list(df.columns)
        row_rng.shuffle(columns)
        df = df[columns]

    if device != "cpu":
        assert device in ["cuda", "gpu"]
        import cudf
        import cupy

        df = cudf.from_pandas(df)
        label = cupy.array(label)
    return df, label
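

# Illustrative usage sketch (added for clarity, not an original helper): the
# generated DataFrame is normally fed to a DMatrix with categorical support
# enabled.
def _example_categorical_usage() -> None:
    """Build a DMatrix from mixed categorical/numerical features."""
    X, y = make_categorical(
        n_samples=128, n_features=4, n_categories=6, onehot=False, cat_ratio=0.5
    )
    Xy = DMatrix(X, y, enable_categorical=True)
    assert Xy.num_row() == 128
    assert Xy.num_col() == 4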


class IteratorForTest(DataIter):
    """Iterator for testing streaming DMatrix (external memory, quantile)."""

    def __init__(  # pylint: disable=too-many-arguments
        self,
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
        *,
        cache: Optional[str],
        on_host: bool = False,
        min_cache_page_bytes: Optional[int] = None,
    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.w = w
        self.it = 0
        super().__init__(
            cache_prefix=cache,
            on_host=on_host,
            min_cache_page_bytes=min_cache_page_bytes,
        )

    def next(self, input_data: Callable) -> bool:
        if self.it == len(self.X):
            return False

        with pytest.raises(TypeError, match="Keyword argument"):
            input_data(self.X[self.it], self.y[self.it], None)

        # Use copy to make sure the iterator doesn't hold a reference to the data.
        input_data(
            data=self.X[self.it].copy(),
            label=self.y[self.it].copy(),
            weight=self.w[self.it].copy() if self.w else None,
        )
        gc.collect()  # Clear up the copy, see if XGBoost accesses freed memory.
        self.it += 1
        return True

    def reset(self) -> None:
        self.it = 0

    def as_arrays(
        self,
    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
        """Return concatenated arrays."""
        if isinstance(self.X[0], sparse.csr_matrix):
            X = sparse.vstack(self.X, format="csr")
        else:
            X = np.concatenate(self.X, axis=0)
        y = np.concatenate(self.y, axis=0)
        if self.w:
            w = np.concatenate(self.w, axis=0)
        else:
            w = None
        return X, y, w
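

# Illustrative usage sketch (added for clarity, not an original helper): with a
# cache prefix the iterator drives the external-memory DMatrix instead of the
# in-core QuantileDMatrix; `tmpdir` is a hypothetical writable directory.
def _example_external_memory(tmpdir: str) -> None:
    """Construct an external-memory DMatrix from the test iterator."""
    X, y, w = make_batches(n_samples_per_batch=64, n_features=3, n_batches=2)
    it = IteratorForTest(X, y, w, cache=os.path.join(tmpdir, "cache"))
    Xy = DMatrix(it)
    assert Xy.num_row() == 64 * 2
    assert Xy.num_col() == 3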