MLPproject/.venv/lib/python3.12/site-packages/xgboost/testing/interaction_constraints.py

"""Tests for interaction constraints."""

from typing import Optional, Sequence, Union

import numpy as np

from .._typing import FeatureNames
from ..core import DMatrix
from ..training import train
from .utils import Device


def run_interaction_constraints(  # pylint: disable=too-many-locals
    tree_method: str,
    device: Device,
    feature_names: Optional[FeatureNames] = None,
    interaction_constraints: Union[str, Sequence] = "[[0, 1]]",
) -> None:
    """Check that a constrained booster treats the third feature additively.

    A booster is fitted on synthetic data with three features whose target
    contains an ``x1 * x2 * x3`` term.  Predictions are then made with the
    third feature pinned to 1, 2 and 3 for every row; the change in prediction
    between consecutive levels must be the same for all rows (within 1e-4).

    Parameters
    ----------
    tree_method :
        Forwarded as the ``tree_method`` training parameter.
    device :
        Forwarded as the ``device`` training parameter.
    feature_names :
        Passed to every ``DMatrix`` built here (training and prediction).
    interaction_constraints :
        Forwarded as the ``interaction_constraints`` training parameter.  The
        default only lets the first two features interact; the assertions
        presumably only hold for constraints that keep the third feature
        isolated in the same way.

    Raises
    ------
    AssertionError
        If the per-row prediction shift between two levels of the third
        feature is not constant.
    """
    n_samples = 1000

    # Keep this draw order (x1, x2, x3, noise) so a caller-side seed yields the
    # same data regardless of how the target expression below is written.
    x1 = np.random.normal(loc=1.0, scale=1.0, size=n_samples)
    x2 = np.random.normal(loc=1.0, scale=1.0, size=n_samples)
    x3 = np.random.choice([1, 2, 3], size=n_samples, replace=True)
    noise = np.random.normal(loc=0.001, scale=1.0, size=n_samples)

    # The target mixes additive terms with a three-way interaction.
    y = x1 + x2 + x3 + x1 * x2 * x3 + noise + 3 * np.sin(x1)

    features = np.column_stack((x1, x2, x3))
    dtrain = DMatrix(features, label=y, feature_names=feature_names)

    params = {
        "max_depth": 3,
        "eta": 0.1,
        "nthread": 2,
        "interaction_constraints": interaction_constraints,
        "tree_method": tree_method,
        "device": device,
    }
    rounds = 12
    booster = train(params, dtrain, rounds, evals=[(dtrain, "train")])

    def predict_with_fixed_x3(level: int) -> np.ndarray:
        # Every row keeps its own x1/x2 but shares the same x3 value.
        pinned = np.column_stack((x1, x2, np.repeat(level, n_samples)))
        return booster.predict(DMatrix(pinned, feature_names=feature_names))

    low, mid, high = (predict_with_fixed_x3(level) for level in (1, 2, 3))

    # With x3 unable to interact with x1 or x2, moving x3 from one level to
    # the next has to shift every row's prediction by the same amount.
    for shift in (mid - low, high - mid):
        assert np.all(np.abs(shift - shift[0]) < 1e-4)


def training_accuracy(tree_method: str, dpath: str, device: Device) -> None:
    """Train with interaction constraints on agaricus and check test quality.

    The train/test files are loaded from ``dpath`` (which is concatenated
    directly with the file names, so it must end with a path separator).  A
    booster is trained once per grow policy (``lossguide``, then
    ``depthwise``) and checked on the test set.

    Parameters
    ----------
    tree_method :
        Forwarded as the ``tree_method`` training parameter.
    dpath :
        Prefix prepended to the agaricus file names.
    device :
        Forwarded as the ``device`` training parameter.

    Raises
    ------
    AssertionError
        If the check described in the loop below fails for either policy.
    """
    from sklearn.metrics import accuracy_score

    dtrain = DMatrix(dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm")
    dtest = DMatrix(dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm")

    params = {
        "eta": 1,
        "max_depth": 6,
        "objective": "binary:logistic",
        "tree_method": tree_method,
        "device": device,
        "interaction_constraints": "[[1,2], [2,3,4]]",
    }
    rounds = 5

    for policy in ("lossguide", "depthwise"):
        params["grow_policy"] = policy
        booster = train(params, dtrain, rounds)
        # The mask is True where the predicted probability is *below* 0.5, so
        # "accuracy" of this mask against the labels being under 0.1 amounts
        # (for 0/1 labels) to a misclassification rate under 10%.
        below_half = booster.predict(dtest) < 0.5
        assert accuracy_score(dtest.get_label(), below_half) < 0.1