# pylint: disable=invalid-name, too-many-arguments, too-many-positional-arguments
"""Tests for compatibility with sklearn."""

from typing import Callable, Optional, Type

import numpy as np
import pytest

from ..core import DMatrix
from ..sklearn import XGBClassifier, XGBRegressor, XGBRFRegressor
from .data import get_california_housing, make_batches
from .ordinal import make_recoded
from .utils import Device, assert_allclose


def run_boost_from_prediction_binary(
    tree_method: str,
    device: Device,
    X: np.ndarray,
    y: np.ndarray,
    as_frame: Optional[Callable],
) -> None:
    """Boosting from prediction with binary classification.

    Parameters
    ----------
    as_frame :
        A callable that converts the margin into a DataFrame, useful for testing
        different DataFrame implementations.
    """
    # Train for 4 rounds, then export the raw (margin) predictions.
    model_0 = XGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_0.fit(X=X, y=y)
    margin = model_0.predict(X, output_margin=True)
    if as_frame is not None:
        margin = as_frame(margin)

    # Continue training for another 4 rounds, boosting from the exported margin.
    model_1 = XGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = model_1.predict(X, base_margin=margin)

    # A model trained for 8 rounds in one session should produce the same result.
    cls_2 = XGBClassifier(
        learning_rate=0.3,
        random_state=0,
        n_estimators=8,
        tree_method=tree_method,
        device=device,
    )
    cls_2.fit(X=X, y=y)
    predictions_2 = cls_2.predict(X)
    np.testing.assert_allclose(predictions_1, predictions_2)

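
# The same continuation can be written against the native interface (a sketch,
# assuming the top-level `xgboost` package; the test above goes through the
# sklearn wrapper instead):
#
#   import xgboost as xgb
#
#   dtrain = xgb.DMatrix(X, label=y)
#   booster_0 = xgb.train({"tree_method": tree_method}, dtrain, num_boost_round=4)
#   dtrain.set_base_margin(booster_0.predict(dtrain, output_margin=True))
#   booster_1 = xgb.train({"tree_method": tree_method}, dtrain, num_boost_round=4)
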
def run_boost_from_prediction_multi_class(
    estimator: Type,
    tree_method: str,
    device: Device,
    X: np.ndarray,
    y: np.ndarray,
    as_frame: Optional[Callable],
) -> None:
    """Boosting from prediction with multi-class classification."""
    # Train for 4 rounds, then export the raw (margin) predictions.
    model_0 = estimator(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_0.fit(X=X, y=y)
    margin = model_0.get_booster().inplace_predict(X, predict_type="margin")
    if as_frame is not None:
        margin = as_frame(margin)

    # Continue training for another 4 rounds, boosting from the exported margin.
    model_1 = estimator(
        learning_rate=0.3,
        random_state=0,
        n_estimators=4,
        tree_method=tree_method,
        device=device,
    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = model_1.get_booster().predict(
        DMatrix(X, base_margin=margin), output_margin=True
    )

    # A model trained for 8 rounds in one session should produce the same margin.
    model_2 = estimator(
        learning_rate=0.3,
        random_state=0,
        n_estimators=8,
        tree_method=tree_method,
        device=device,
    )
    model_2.fit(X=X, y=y)
    predictions_2 = model_2.get_booster().inplace_predict(X, predict_type="margin")

    # CuPy arrays expose `get` for copying the data back to the host.
    if hasattr(predictions_1, "get"):
        predictions_1 = predictions_1.get()
    if hasattr(predictions_2, "get"):
        predictions_2 = predictions_2.get()
    np.testing.assert_allclose(predictions_1, predictions_2, atol=1e-6)

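
# A note on the test above: for multi-class objectives the margin is one raw
# score per class, so `base_margin` carries a full `(n_samples, n_classes)`
# matrix rather than the single column used in the binary case.
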
def run_housing_rf_regression(tree_method: str, device: Device) -> None:
    """Test with the California housing dataset."""
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    X, y = get_california_housing()
    rng = np.random.RandomState(1994)
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = XGBRFRegressor(
            random_state=42, tree_method=tree_method, device=device
        ).fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 35

    # Random forests are trained in a single boosting round, so early stopping
    # is not applicable.
    rfreg = XGBRFRegressor(device=device)
    with pytest.raises(NotImplementedError):
        rfreg.set_params(early_stopping_rounds=10)
        rfreg.fit(X, y)

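
# `XGBRFRegressor(n_estimators=k)` roughly corresponds to the native
# configuration below, with every tree grown in one boosting round (a sketch
# based on the documented random-forest defaults; `dtrain` is assumed):
#
#   import xgboost as xgb
#
#   booster = xgb.train(
#       {
#           "num_parallel_tree": k,
#           "learning_rate": 1.0,
#           "subsample": 0.8,
#           "colsample_bynode": 0.8,
#       },
#       dtrain,
#       num_boost_round=1,
#   )
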
def run_recoding(device: Device) -> None:
    """Test re-coding for training continuation."""
    enc, reenc, y, _, _ = make_recoded(device, n_features=16)
    reg = XGBRegressor(enable_categorical=True, n_estimators=2, device=device)
    reg.fit(enc, y, eval_set=[(reenc, y)])
    results_0 = reg.evals_result()

    booster = reg.get_booster()
    assert not booster.get_categories().empty()

    # Continue training on the re-coded data.
    reg = XGBRegressor(enable_categorical=True, n_estimators=2, device=device)
    reg.fit(reenc, y, xgb_model=booster, eval_set=[(enc, y)])
    results_1 = reg.evals_result()

    booster = reg.get_booster()
    assert booster.num_boosted_rounds() == 4
    assert not booster.get_categories().empty()

    # Reference: train for 4 rounds in a single session.
    reg = XGBRegressor(enable_categorical=True, n_estimators=4, device=device)
    reg.fit(enc, y, eval_set=[(reenc, y)])
    results_2 = reg.evals_result()

    # The two-session evaluation history should concatenate into the
    # single-session one.
    np.testing.assert_allclose(
        results_2["validation_0"]["rmse"],
        results_0["validation_0"]["rmse"] + results_1["validation_0"]["rmse"],
    )

    # Re-coded inputs must yield identical predictions and leaf assignments.
    np.testing.assert_allclose(reg.predict(reenc), reg.predict(enc))
    np.testing.assert_allclose(reg.apply(reenc), reg.apply(enc))

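
# What "re-coding" means here (a sketch, assuming a pandas-style categorical
# backend; `make_recoded` returns an equivalent pair along these lines):
#
#   import pandas as pd
#
#   enc = pd.DataFrame(
#       {"f0": pd.Categorical(["a", "b", "a"], categories=["a", "b"])}
#   )
#   # Same values, different category ordering, hence different integer codes.
#   reenc = pd.DataFrame(
#       {"f0": pd.Categorical(["a", "b", "a"], categories=["b", "a"])}
#   )
#
# The booster must map `reenc` onto the categories recorded during training
# instead of trusting the new codes.
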
def run_intercept(device: Device) -> None:
    """Tests for the intercept."""
    from sklearn.datasets import make_classification, make_multilabel_classification

    X, y, w = [v[0] for v in make_batches(256, 3, 1, use_cupy=False)]
    reg = XGBRegressor(device=device)
    reg.fit(X, y, sample_weight=w)
    result = reg.intercept_
    assert result.dtype == np.float32
    assert result[0] < 0.5

    reg = XGBRegressor(booster="gblinear", device=device)
    reg.fit(X, y, sample_weight=w)
    result = reg.intercept_
    assert isinstance(result, np.ndarray)
    assert result.dtype == np.float32
    assert result[0] < 0.5

    n_classes = 4
    X, y = make_classification(
        random_state=1994,
        n_samples=128,
        n_features=16,
        n_classes=n_classes,
        n_informative=16,
        n_redundant=0,
    )

    # For multi-class softprob, the intercept is one probability per class.
    clf = XGBClassifier(booster="gbtree", objective="multi:softprob", device=device)
    clf.fit(X, y)
    result = clf.intercept_
    assert isinstance(result, np.ndarray)
    assert len(result) == n_classes
    assert (result >= 0.0).all()
    np.testing.assert_allclose(sum(result), 1.0)

    # Tests for user input.
    # Multi-class
    intercept = np.ones(shape=(n_classes,), dtype=np.float32) / n_classes
    if device == "cuda":
        import cupy as cp

        intercept = cp.array(intercept)

    clf = XGBClassifier(objective="multi:softprob", base_score=intercept)
    clf.fit(X, y)
    assert_allclose(device, intercept, clf.intercept_)

    X, y = make_multilabel_classification(  # pylint: disable=unbalanced-tuple-unpacking
        random_state=1994, n_samples=128, n_features=16, n_classes=n_classes
    )

    # Multi-label: one intercept per label.
    intercept = np.ones(shape=(n_classes,), dtype=np.float32) / 2
    if device == "cuda":
        import cupy as cp

        intercept = cp.array(intercept)

    clf = XGBClassifier(base_score=intercept)
    clf.fit(X, y)
    assert_allclose(device, intercept, clf.intercept_)
    assert clf.objective == "binary:logistic"
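
# A note on the user-input assertions above: a supplied `base_score` is
# surfaced back through `intercept_` after training, as in this sketch
# mirroring the multi-label check (`X_bin`/`y_bin` are hypothetical binary
# data):
#
#   clf = XGBClassifier(base_score=np.array([0.25], dtype=np.float32))
#   clf.fit(X_bin, y_bin)
#   np.testing.assert_allclose(clf.intercept_, [0.25])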