Files
MLPproject/.venv/lib/python3.12/site-packages/xgboost/testing/with_skl.py
2025-10-23 15:44:32 +02:00

232 lines
6.9 KiB
Python

# pylint: disable=invalid-name, too-many-arguments, too-many-positional-arguments
"""Tests for compatiblity with sklearn."""
from typing import Callable, Optional, Type
import numpy as np
import pytest
from ..core import DMatrix
from ..sklearn import XGBClassifier, XGBRegressor, XGBRFRegressor
from .data import get_california_housing, make_batches
from .ordinal import make_recoded
from .utils import Device, assert_allclose
def run_boost_from_prediction_binary(
    tree_method: str,
    device: Device,
    X: np.ndarray,
    y: np.ndarray,
    as_frame: Optional[Callable],
) -> None:
    """Check boosting from a previous prediction with ``XGBClassifier``.

    A 4-round classifier is fitted and its raw margin is passed as ``base_margin``
    to a second 4-round classifier. The predictions of that second classifier
    must match those of a single classifier fitted for 8 rounds.

    Parameters
    ----------
    tree_method:
        Forwarded to every classifier as ``tree_method``.
    device:
        Forwarded to every classifier as ``device``.
    X, y:
        Data used both for fitting and for prediction.
    as_frame:
        A callable function to convert margin into DataFrame, useful for
        different df implementations. The margin is used as-is when ``None``.
    """
    # Everything except the number of rounds is shared by the three models.
    shared = {
        "learning_rate": 0.3,
        "random_state": 0,
        "tree_method": tree_method,
        "device": device,
    }

    # Stage one: a short run whose margin seeds the second stage.
    first_stage = XGBClassifier(n_estimators=4, **shared)
    first_stage.fit(X=X, y=y)
    margin = first_stage.predict(X, output_margin=True)
    if as_frame is not None:
        margin = as_frame(margin)

    # Stage two: the same number of rounds again, starting from that margin.
    second_stage = XGBClassifier(n_estimators=4, **shared)
    second_stage.fit(X=X, y=y, base_margin=margin)
    continued = second_stage.predict(X, base_margin=margin)

    # Reference: all 8 rounds in one go.
    single_run = XGBClassifier(n_estimators=8, **shared)
    single_run.fit(X=X, y=y)
    reference = single_run.predict(X)

    np.testing.assert_allclose(continued, reference)
def run_boost_from_prediction_multi_clasas(
    estimator: Type,
    tree_method: str,
    device: Device,
    X: np.ndarray,
    y: np.ndarray,
    as_frame: Optional[Callable],
) -> None:
    """Boosting from prediction with multi-class clf.

    Fits ``estimator`` for 4 rounds, continues for another 4 rounds from the
    resulting margin, and compares the final margin against an 8-round model
    with an absolute tolerance of ``1e-6``.

    Parameters
    ----------
    estimator:
        Estimator class; it is instantiated three times with identical
        parameters apart from ``n_estimators``.
    tree_method, device:
        Forwarded to each estimator instance.
    X, y:
        Data used both for fitting and for prediction.
    as_frame:
        Optional callable applied to the first-stage margin before it is used
        as ``base_margin``.
    """

    def build(n_rounds: int):
        # Only the number of boosting rounds differs between the models.
        return estimator(
            learning_rate=0.3,
            random_state=0,
            n_estimators=n_rounds,
            tree_method=tree_method,
            device=device,
        )

    def to_host(values):
        # Results exposing ``.get()`` are converted through it before the
        # comparison (presumably device arrays such as cupy -- confirm).
        return values.get() if hasattr(values, "get") else values

    # Multi-class
    base_model = build(4)
    base_model.fit(X=X, y=y)
    margin = base_model.get_booster().inplace_predict(X, predict_type="margin")
    if as_frame is not None:
        margin = as_frame(margin)

    continued_model = build(4)
    continued_model.fit(X=X, y=y, base_margin=margin)
    continued_booster = continued_model.get_booster()
    continued_margin = continued_booster.predict(
        DMatrix(X, base_margin=margin), output_margin=True
    )

    reference_model = build(8)
    reference_model.fit(X=X, y=y)
    reference_booster = reference_model.get_booster()
    reference_margin = reference_booster.inplace_predict(X, predict_type="margin")

    np.testing.assert_allclose(
        to_host(continued_margin), to_host(reference_margin), atol=1e-6
    )
def run_housing_rf_regression(tree_method: str, device: Device) -> None:
    """Test the random forest regressor with the California housing dataset.

    Runs a shuffled 2-fold split, fits an ``XGBRFRegressor`` on each training
    fold and requires the mean squared error on the held-out fold to stay
    below 35. Afterwards verifies that requesting early stopping results in a
    ``NotImplementedError``.

    Parameters
    ----------
    tree_method:
        Forwarded to the per-fold regressors as ``tree_method``.
    device:
        Forwarded to every regressor as ``device``.
    """
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    X, y = get_california_housing()
    rng = np.random.RandomState(1994)
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = XGBRFRegressor(
            random_state=42, tree_method=tree_method, device=device
        ).fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        # sklearn documents the signature as (y_true, y_pred): labels go first.
        assert mean_squared_error(labels, preds) < 35

    # Early stopping is expected to be rejected for the random forest regressor.
    rfreg = XGBRFRegressor(device=device)
    with pytest.raises(NotImplementedError):
        rfreg.set_params(early_stopping_rounds=10)
        rfreg.fit(X, y)
def run_recoding(device: Device) -> None:
    """Test re-coding for training continuation.

    Trains 2 rounds on ``enc``, continues for 2 more rounds on ``reenc`` from
    the first booster, and compares the joined evaluation history with a
    single 4-round model. The 4-round model must also give the same
    predictions and leaf indices for ``enc`` and ``reenc``.

    Parameters
    ----------
    device:
        Passed to ``make_recoded`` and to every regressor.
    """
    # NOTE(review): ``enc``/``reenc`` are presumably the same data under two
    # different categorical encodings -- confirm against ``make_recoded``.
    enc, reenc, y, _, _ = make_recoded(device, n_features=16)

    def new_regressor(n_rounds: int) -> XGBRegressor:
        return XGBRegressor(
            enable_categorical=True, n_estimators=n_rounds, device=device
        )

    # First half: fit on ``enc``, evaluate on ``reenc``.
    first = new_regressor(2)
    first.fit(enc, y, eval_set=[(reenc, y)])
    first_rmse = first.evals_result()["validation_0"]["rmse"]
    first_booster = first.get_booster()
    assert not first_booster.get_categories().empty()

    # Second half: continue from the first booster with the inputs swapped.
    second = new_regressor(2)
    second.fit(reenc, y, xgb_model=first_booster, eval_set=[(enc, y)])
    second_rmse = second.evals_result()["validation_0"]["rmse"]
    second_booster = second.get_booster()
    assert second_booster.num_boosted_rounds() == 4
    assert not second_booster.get_categories().empty()

    # Reference: all four rounds in one run.
    full = new_regressor(4)
    full.fit(enc, y, eval_set=[(reenc, y)])
    full_rmse = full.evals_result()["validation_0"]["rmse"]

    np.testing.assert_allclose(full_rmse, first_rmse + second_rmse)
    np.testing.assert_allclose(full.predict(reenc), full.predict(enc))
    np.testing.assert_allclose(full.apply(reenc), full.apply(enc))
def run_intercept(device: Device) -> None:
    """Tests for the intercept.

    Covers the ``intercept_`` attribute for a default regressor, a ``gblinear``
    regressor and a multi-class classifier. Then checks that a user-supplied
    ``base_score`` is reported back through ``intercept_`` for multi-class and
    multi-label classifiers.

    Parameters
    ----------
    device:
        Device for the estimators under test. When it equals ``"cuda"`` the
        user-supplied intercepts are converted to cupy arrays.
    """
    from sklearn.datasets import make_classification, make_multilabel_classification

    def on_device(values: np.ndarray):
        # cupy is imported lazily so it is only required for cuda runs.
        if device == "cuda":
            import cupy as cp

            return cp.array(values)
        return values

    X, y, w = [batch[0] for batch in make_batches(256, 3, 1, use_cupy=False)]

    # Regressor with default parameters.
    tree_reg = XGBRegressor(device=device)
    tree_reg.fit(X, y, sample_weight=w)
    fitted = tree_reg.intercept_
    assert fitted.dtype == np.float32
    assert fitted[0] < 0.5

    # Regressor with the linear booster.
    linear_reg = XGBRegressor(booster="gblinear", device=device)
    linear_reg.fit(X, y, sample_weight=w)
    fitted = linear_reg.intercept_
    assert isinstance(fitted, np.ndarray)
    assert fitted.dtype == np.float32
    assert fitted[0] < 0.5

    # Multi-class classifier: one non-negative entry per class, summing to 1.
    n_classes = 4
    X, y = make_classification(
        n_samples=128,
        n_features=16,
        n_informative=16,
        n_redundant=0,
        n_classes=n_classes,
        random_state=1994,
    )
    softprob = XGBClassifier(
        booster="gbtree", objective="multi:softprob", device=device
    )
    softprob.fit(X, y)
    fitted = softprob.intercept_
    assert isinstance(fitted, np.ndarray)
    assert len(fitted) == n_classes
    assert (fitted >= 0.0).all()
    np.testing.assert_allclose(sum(fitted), 1.0)

    # Tests for user input
    # Multi-class
    user_intercept = on_device(np.ones(shape=n_classes, dtype=np.float32) / n_classes)
    multi_class = XGBClassifier(objective="multi:softprob", base_score=user_intercept)
    multi_class.fit(X, y)
    assert_allclose(device, user_intercept, multi_class.intercept_)

    X, y = make_multilabel_classification(  # pylint: disable=unbalanced-tuple-unpacking
        random_state=1994, n_samples=128, n_features=16, n_classes=n_classes
    )
    # Multi-label
    user_intercept = on_device(np.ones(shape=n_classes, dtype=np.float32) / 2)
    multi_label = XGBClassifier(base_score=user_intercept)
    multi_label.fit(X, y)
    assert_allclose(device, user_intercept, multi_label.intercept_)
    assert multi_label.objective == "binary:logistic"