# pylint: disable=invalid-name, too-many-arguments, too-many-positional-arguments
"""Tests for compatibility with sklearn."""

from typing import Callable, Optional, Type

import numpy as np
import pytest

from ..core import DMatrix
from ..sklearn import XGBClassifier, XGBRegressor, XGBRFRegressor
from .data import get_california_housing, make_batches
from .ordinal import make_recoded
from .utils import Device, assert_allclose


def run_boost_from_prediction_binary(
    tree_method: str,
    device: Device,
    X: np.ndarray,
    y: np.ndarray,
    as_frame: Optional[Callable],
) -> None:
    """Boosting from prediction with binary classification.

    Parameters
    ----------
    as_frame :
        A callable that converts the margin into a DataFrame, useful for testing
        different DataFrame implementations.
    """
    # Train for 4 rounds, then export the raw (margin) predictions.
    model_0 = XGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_0.fit(X=X, y=y)
    margin = model_0.predict(X, output_margin=True)
    if as_frame is not None:
        margin = as_frame(margin)

    # Continue training for another 4 rounds, boosting from the exported margin.
    model_1 = XGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = model_1.predict(X, base_margin=margin)

    # A model trained for 8 rounds in one session should produce the same result.
    cls_2 = XGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=8,
        tree_method=tree_method,
        device=device,
    )
    cls_2.fit(X=X, y=y)
    predictions_2 = cls_2.predict(X)
    np.testing.assert_allclose(predictions_1, predictions_2)

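
# The same continuation can be written against the native interface (a sketch,
# assuming the top-level `xgboost` package; the test above goes through the
# sklearn wrapper instead):
#
#   import xgboost as xgb
#
#   dtrain = xgb.DMatrix(X, label=y)
#   booster_0 = xgb.train({"tree_method": tree_method}, dtrain, num_boost_round=4)
#   dtrain.set_base_margin(booster_0.predict(dtrain, output_margin=True))
#   booster_1 = xgb.train({"tree_method": tree_method}, dtrain, num_boost_round=4)
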
def run_boost_from_prediction_multi_class(
    estimator: Type,
    tree_method: str,
    device: Device,
    X: np.ndarray,
    y: np.ndarray,
    as_frame: Optional[Callable],
) -> None:
    """Boosting from prediction with multi-class classification."""
    # Train for 4 rounds, then export the raw (margin) predictions.
    model_0 = estimator(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_0.fit(X=X, y=y)
    margin = model_0.get_booster().inplace_predict(X, predict_type="margin")
    if as_frame is not None:
        margin = as_frame(margin)

    # Continue training for another 4 rounds, boosting from the exported margin.
    model_1 = estimator(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = model_1.get_booster().predict(
        DMatrix(X, base_margin=margin), output_margin=True
    )

    # A model trained for 8 rounds in one session should produce the same margin.
    model_2 = estimator(
        learning_rate=0.3,
        random_state=0,
        n_estimators=8,
        tree_method=tree_method,
        device=device,
    )
    model_2.fit(X=X, y=y)
    predictions_2 = model_2.get_booster().inplace_predict(X, predict_type="margin")

    # CuPy arrays expose `get` for copying the data back to the host.
    if hasattr(predictions_1, "get"):
        predictions_1 = predictions_1.get()
    if hasattr(predictions_2, "get"):
        predictions_2 = predictions_2.get()
    np.testing.assert_allclose(predictions_1, predictions_2, atol=1e-6)

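
# A note on the test above: for multi-class objectives the margin is one raw
# score per class, so `base_margin` carries a full `(n_samples, n_classes)`
# matrix rather than the single column used in the binary case.
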
def run_housing_rf_regression(tree_method: str, device: Device) -> None:
    """Test with the California housing dataset."""
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    X, y = get_california_housing()
    rng = np.random.RandomState(1994)
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = XGBRFRegressor(
            random_state=42, tree_method=tree_method, device=device
        ).fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 35

    # Random forests are trained in a single boosting round, so early stopping
    # is not applicable.
    rfreg = XGBRFRegressor(device=device)
    with pytest.raises(NotImplementedError):
        rfreg.set_params(early_stopping_rounds=10)
        rfreg.fit(X, y)

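
# `XGBRFRegressor(n_estimators=k)` roughly corresponds to the native
# configuration below, with every tree grown in one boosting round (a sketch
# based on the documented random-forest defaults; `dtrain` is assumed):
#
#   import xgboost as xgb
#
#   booster = xgb.train(
#       {
#           "num_parallel_tree": k,
#           "learning_rate": 1.0,
#           "subsample": 0.8,
#           "colsample_bynode": 0.8,
#       },
#       dtrain,
#       num_boost_round=1,
#   )
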
def run_recoding(device: Device) -> None:
    """Test re-coding for training continuation."""
    enc, reenc, y, _, _ = make_recoded(device, n_features=16)
    reg = XGBRegressor(enable_categorical=True, n_estimators=2, device=device)
    reg.fit(enc, y, eval_set=[(reenc, y)])
    results_0 = reg.evals_result()

    booster = reg.get_booster()
    assert not booster.get_categories().empty()

    # Continue training on the re-coded data.
    reg = XGBRegressor(enable_categorical=True, n_estimators=2, device=device)
    reg.fit(reenc, y, xgb_model=booster, eval_set=[(enc, y)])
    results_1 = reg.evals_result()

    booster = reg.get_booster()
    assert booster.num_boosted_rounds() == 4
    assert not booster.get_categories().empty()

    # Reference: train for 4 rounds in a single session.
    reg = XGBRegressor(enable_categorical=True, n_estimators=4, device=device)
    reg.fit(enc, y, eval_set=[(reenc, y)])
    results_2 = reg.evals_result()

    # The two-session evaluation history should concatenate into the
    # single-session one.
    np.testing.assert_allclose(
        results_2["validation_0"]["rmse"],
        results_0["validation_0"]["rmse"] + results_1["validation_0"]["rmse"],
    )

    # Re-coded inputs must yield identical predictions and leaf assignments.
    np.testing.assert_allclose(reg.predict(reenc), reg.predict(enc))
    np.testing.assert_allclose(reg.apply(reenc), reg.apply(enc))

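
# What "re-coding" means here (a sketch, assuming a pandas-style categorical
# backend; `make_recoded` returns an equivalent pair along these lines):
#
#   import pandas as pd
#
#   enc = pd.DataFrame(
#       {"f0": pd.Categorical(["a", "b", "a"], categories=["a", "b"])}
#   )
#   # Same values, different category ordering, hence different integer codes.
#   reenc = pd.DataFrame(
#       {"f0": pd.Categorical(["a", "b", "a"], categories=["b", "a"])}
#   )
#
# The booster must map `reenc` onto the categories recorded during training
# instead of trusting the new codes.
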
def run_intercept(device: Device) -> None:
    """Tests for the intercept."""
    from sklearn.datasets import make_classification, make_multilabel_classification

    X, y, w = [v[0] for v in make_batches(256, 3, 1, use_cupy=False)]
    reg = XGBRegressor(device=device)
    reg.fit(X, y, sample_weight=w)
    result = reg.intercept_
    assert result.dtype == np.float32
    assert result[0] < 0.5

    reg = XGBRegressor(booster="gblinear", device=device)
    reg.fit(X, y, sample_weight=w)
    result = reg.intercept_
    assert isinstance(result, np.ndarray)
    assert result.dtype == np.float32
    assert result[0] < 0.5

    n_classes = 4
    X, y = make_classification(
        random_state=1994,
        n_samples=128,
        n_features=16,
        n_classes=n_classes,
        n_informative=16,
        n_redundant=0,
    )

    # For multi-class softprob, the intercept is one probability per class.
    clf = XGBClassifier(booster="gbtree", objective="multi:softprob", device=device)
    clf.fit(X, y)
    result = clf.intercept_
    assert isinstance(result, np.ndarray)
    assert len(result) == n_classes
    assert (result >= 0.0).all()
    np.testing.assert_allclose(sum(result), 1.0)

    # Tests for user input.
    # Multi-class
    intercept = np.ones(shape=(n_classes,), dtype=np.float32) / n_classes
    if device == "cuda":
        import cupy as cp

        intercept = cp.array(intercept)

    clf = XGBClassifier(objective="multi:softprob", base_score=intercept)
    clf.fit(X, y)
    assert_allclose(device, intercept, clf.intercept_)

    X, y = make_multilabel_classification(  # pylint: disable=unbalanced-tuple-unpacking
        random_state=1994, n_samples=128, n_features=16, n_classes=n_classes
    )

    # Multi-label: one intercept per label.
    intercept = np.ones(shape=(n_classes,), dtype=np.float32) / 2
    if device == "cuda":
        import cupy as cp

        intercept = cp.array(intercept)

    clf = XGBClassifier(base_score=intercept)
    clf.fit(X, y)
    assert_allclose(device, intercept, clf.intercept_)
    assert clf.objective == "binary:logistic"
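
# A note on the user-input assertions above: a supplied `base_score` is
# surfaced back through `intercept_` after training, as in this sketch
# mirroring the multi-label check (`X_bin`/`y_bin` are hypothetical binary
# data):
#
#   clf = XGBClassifier(base_score=np.array([0.25], dtype=np.float32))
#   clf.fit(X_bin, y_bin)
#   np.testing.assert_allclose(clf.intercept_, [0.25])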