import numpy as np
import pandas as pd
from enum import Enum
from .. import CatBoostError
from ..core import metric_description_or_str_to_str
from ..utils import compute_wx_test
def calc_wilcoxon_test(baseline, test):
return compute_wx_test(baseline, test)["pvalue"]
class ScoreType(Enum):
Abs = "AbsoluteDiff"
Rel = "RelativeDiff"
class ScoreConfig:
"""
Config to present human-friendly evaluation results.
"""
def __init__(self,
score_type=ScoreType.Rel,
multiplier=100,
score_level=0.01,
interval_level=0.01,
overfit_iterations_info=True):
"""
:param score_type: type of score. For absolute difference the score is (baseline - test).mean(),
for relative difference it is ((baseline - test) / abs(baseline)).mean()
:param multiplier: multiplier applied to the score before printing
:param score_level: WX-test significance level. Used to decide whether the tested case is significantly better or worse
:param interval_level: significance level used to compute the score confidence interval
:param overfit_iterations_info: whether information about overfit iterations should be included
"""
self.type = score_type
self.multiplier = multiplier
self.score_level = score_level
self.interval_level = interval_level
self.overfit_iterations_info = overfit_iterations_info
@staticmethod
def abs_score(level=0.01):
return ScoreConfig(score_type=ScoreType.Abs,
multiplier=1,
score_level=level)
@staticmethod
def rel_score(level=0.01):
return ScoreConfig(score_type=ScoreType.Rel,
multiplier=100,
score_level=level)
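# Illustrative sketch (not part of the original module): the factory methods above
# are shorthand for common configurations. An explicit construction with made-up
# example values would look like this:
#
#     config = ScoreConfig(score_type=ScoreType.Abs,
#                          multiplier=1,
#                          score_level=0.05,
#                          interval_level=0.05,
#                          overfit_iterations_info=True)
#
# ScoreConfig.abs_score(level=0.05) is similar, but leaves interval_level and
# overfit_iterations_info at their defaults (0.01 and True).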
def calc_bootstrap_ci_for_mean(samples, level=0.05, tries=999):
"""
Compute a bootstrap confidence interval for the mean of the given samples.
:param samples: array-like of per-fold values
:param level: (float) significance level for the confidence interval
:param tries: number of bootstrap resamples to use
:return: (left, right) borders of the confidence interval
"""
samples = np.array(samples)
if not (samples == 0).all():
means = []
for _ in range(0, tries):
resample = np.random.choice(samples, len(samples))
means.append(np.mean(resample))
means = sorted(means)
left = means[int(tries * (level / 2))]
right = means[int(tries * (1.0 - level / 2))]
return left, right
else:
return 0, 0
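# Illustrative usage sketch for the bootstrap helper above; the numbers are
# made up purely for demonstration:
#
#     diffs = np.array([0.012, 0.008, 0.015, 0.009, 0.011])
#     left, right = calc_bootstrap_ci_for_mean(diffs, level=0.05, tries=999)
#     # left <= diffs.mean() <= right holds with roughly 95% confidence
#     # (bootstrap approximation over 999 resamples)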
class CaseEvaluationResult:
"""
CaseEvaluationResult stores aggregated statistics for one ExecutionCase and one metric.
"""
def __init__(self, case, metric_description, eval_step):
self._case = case
self._metric_description = metric_description
self._fold_metric = pd.Series(dtype=float)
self._fold_metric_iteration = pd.Series(dtype=int)
self._fold_curves = dict()
self._eval_step = eval_step
def _add(self, model, learning_curve):
if model.get_case() != self._case:
raise CatBoostError("Model case should be equal to result case")
fold_id = model.get_fold_id()
self._fold_curves[fold_id] = learning_curve
score = max(learning_curve) if self._metric_description.is_max_optimal() else min(learning_curve)
position = np.argmax(learning_curve) if self._metric_description.is_max_optimal() else np.argmin(
learning_curve)
self._fold_metric.at[fold_id] = score
self._fold_metric_iteration.at[fold_id] = position
def __eq__(self, other):
return (np.all(self._fold_metric == other._fold_metric)
and np.all(self._fold_metric_iteration == other._fold_metric_iteration)
and self._fold_curves == other._fold_curves)
def get_case(self):
"""
:return: ExecutionCase for this result
"""
return self._case
def get_fold_ids(self):
"""
:return: fold ids for which this case result was calculated
"""
return self._fold_curves.keys()
def get_best_metric_for_fold(self, fold):
"""
:param fold: id of fold to get result
:return: best metric value, best metric iteration
"""
return self._fold_metric[fold], self._fold_metric_iteration[fold]
def get_best_iterations(self):
"""
:return: pandas Series with best iterations on all folds
"""
return self._fold_metric_iteration
def get_best_metrics(self):
"""
:return: pandas series with best metric values
"""
return self._fold_metric
def get_fold_curve(self, fold):
"""
:param fold:
:return: fold learning curve (test scores on every eval_period iteration)
"""
return self._fold_curves[fold]
def get_metric_description(self):
"""
:return: Metric used to build this CaseEvaluationResult
"""
return self._metric_description
def get_eval_step(self):
"""
:return: step which was used for metric computations
"""
return self._eval_step
def count_under_and_over_fits(self, overfit_border=0.15, underfit_border=0.95):
"""
:param overfit_border: if the best iteration occurs before this fraction of the total iterations, the fold is counted as overfitting
:param underfit_border: if the best iteration occurs after this fraction of the total iterations, the fold is counted as underfitting
:return: (number of folds counted as overfitting, number of folds counted as underfitting)
"""
count_overfitting = 0
count_underfitting = 0
for fold_id, fold_curve in self._fold_curves.items():
best_score_position = self._fold_metric_iteration[fold_id]
best_model_size_fraction = best_score_position * 1.0 / len(fold_curve)
if best_model_size_fraction < overfit_border:
count_overfitting += 1
elif best_model_size_fraction > underfit_border:
count_underfitting += 1
return count_overfitting, count_underfitting
def estimate_fit_quality(self):
"""
:return: simple sanity check result: "Overfitting", "Underfitting" or "Good", depending on which behaviour dominates across folds
"""
count_overfitting, count_underfitting = self.count_under_and_over_fits()
if count_overfitting > count_underfitting:
return "Overfitting"
if count_underfitting > count_overfitting:
return "Underfitting"
return "Good"
def create_learning_curves_plot(self, offset=None):
"""
:param offset: First iteration to plot
:return: plotly Figure with learning curves for each fold
"""
import plotly.graph_objs as go
traces = []
for fold in self.get_fold_ids():
scores_curve = self.get_fold_curve(fold)
if offset is not None:
first_idx = offset
else:
first_idx = int(len(scores_curve) * 0.1)
traces.append(go.Scatter(x=[i * int(self._eval_step) for i in range(first_idx, len(scores_curve))],
y=scores_curve[first_idx:],
mode='lines',
name='Fold #{}'.format(fold)))
layout = go.Layout(
title='Learning curves for case {}'.format(self._case),
hovermode='closest',
xaxis=dict(
title='Iteration',
ticklen=5,
zeroline=False,
gridwidth=2,
),
yaxis=dict(
title='Metric',
ticklen=5,
gridwidth=2,
),
showlegend=True
)
fig = go.Figure(data=traces, layout=layout)
return fig
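# Usage sketch (illustrative): assuming `case_result` is a populated
# CaseEvaluationResult, the per-fold curves can be rendered with plotly:
#
#     fig = case_result.create_learning_curves_plot(offset=0)
#     fig.show()  # or plotly.offline.iplot(fig) inside a notebook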
class MetricEvaluationResult:
"""
Evaluation result for one metric.
Stores results for all ExecutionCases for the specified metric.
Computes human-friendly comparison tables and plots.
"""
def __init__(self, case_results):
if len(case_results) <= 1:
raise CatBoostError("Need at least 2 case results, got {} ".format(len(case_results)))
self._case_results = dict()
self._case_comparisons = dict()
self._cases = [case_result.get_case() for case_result in case_results]
for case_result in case_results:
case = case_result.get_case()
self._case_results[case] = case_result
self._metric_description = case_results[0].get_metric_description()
self._baseline_case = case_results[0].get_case()
self._score_config = ScoreConfig()
for (case, case_result) in self._case_results.items():
if case_result.get_metric_description() != self._metric_description:
raise CatBoostError("Metric names should be equal for all case results")
if case_result.get_fold_ids() != self.get_fold_ids():
raise CatBoostError("Case results should be computed on the same folds")
if case_result.get_eval_step() != self.get_eval_step():
raise CatBoostError("Eval steps should be equal for different cases")
def __clear_comparisons(self):
self._case_comparisons = dict()
def _change_score_config(self, config):
if config is not None:
if isinstance(config, ScoreType):
if config == ScoreType.Abs:
config = ScoreConfig.abs_score()
elif config == ScoreType.Rel:
config = ScoreConfig.rel_score()
else:
raise CatBoostError("Unknown scoreType {}".format(config))
if self._score_config != config:
self._score_config = config
self.__clear_comparisons()
def _compute_case_result_table(self, baseline_case):
result = pd.DataFrame()
baseline_scores = self._case_results[baseline_case].get_best_metrics()
baseline_iters = self._case_results[baseline_case].get_best_iterations()
for (case, case_result) in self._case_results.items():
if case != baseline_case:
test_scores = case_result.get_best_metrics()
pvalue = calc_wilcoxon_test(baseline_scores, test_scores)
diff = (baseline_scores - test_scores)
if self._score_config.type == ScoreType.Rel:
diff = diff / baseline_scores.abs()
if self._metric_description.is_max_optimal():
diff = -diff
mean_diff = diff.mean()
left_quantile, right_quantile = calc_bootstrap_ci_for_mean(diff,
self._score_config.interval_level)
case_name = str(case)
result.at[case_name, "PValue"] = pvalue
result.at[case_name, "Score"] = mean_diff * self._score_config.multiplier
left_quantile_title = "Quantile {}".format(self._score_config.interval_level / 2)
right_quantile_title = "Quantile {}".format(1.0 - self._score_config.interval_level / 2)
result.at[case_name, left_quantile_title] = left_quantile * self._score_config.multiplier
result.at[case_name, right_quantile_title] = right_quantile * self._score_config.multiplier
decision = "UNKNOWN"
if pvalue < self._score_config.score_level:
if mean_diff > 0:
decision = "GOOD"
elif mean_diff < 0:
decision = "BAD"
result.at[case_name, "Decision"] = decision
if self._score_config.overfit_iterations_info:
test_iters = case_result.get_best_iterations()
pvalue = calc_wilcoxon_test(baseline_iters, test_iters)
result.at[case_name, "Overfit iter diff"] = (test_iters - baseline_iters).mean()
result.at[case_name, "Overfit iter pValue"] = pvalue
return result.sort_values(by=["Score"], ascending=self._metric_description.is_max_optimal())
def get_baseline_case(self):
"""
:return: ExecutionCase used as the baseline (against which everything else is compared)
"""
return self._baseline_case
def get_cases(self):
"""
:return: Cases which are compared
"""
return self._cases
def get_metric_description(self):
"""
:return: Metric for which results were calculated
"""
return self._metric_description
def get_baseline_comparison(self, score_config=None):
"""
Get a human-friendly table with model comparisons.
Compares the baseline case against all other computed cases.
:param score_config: config used to present a human-friendly score, optional. Instance of ScoreConfig
:return: pandas DataFrame. Each row is related to one ExecutionCase and
describes how much better (or worse) that case is compared to the baseline.
"""
case = self._baseline_case
return self.get_case_comparison(case, score_config)
def get_case_comparison(self, case, score_config=None):
"""
Get a human-friendly table with model comparisons.
Same as get_baseline_comparison(), but with another case used as the baseline.
:param case: use the specified case as the baseline
:param score_config: config used to present a human-friendly score, optional. Instance of ScoreConfig
:return: pandas DataFrame. Each row is related to one ExecutionCase and
describes how much better (or worse) that case is compared to the specified baseline.
"""
self._change_score_config(score_config)
if case not in self._case_comparisons:
self._case_comparisons[case] = self._compute_case_result_table(case)
return self._case_comparisons[case]
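# Usage sketch (illustrative): assuming `metric_result` is a MetricEvaluationResult,
# the comparison table can be requested with a custom score presentation and then
# filtered to the statistically significant rows:
#
#     table = metric_result.get_baseline_comparison(score_config=ScoreConfig.abs_score(level=0.05))
#     significant = table[table["Decision"] != "UNKNOWN"]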
def change_baseline_case(self, case):
"""
:param case: new baseline case
:return:
"""
if case not in self._case_results:
raise CatBoostError("Case {} is unknown. Can't use it as baseline".format(case))
self._baseline_case = case
def get_case_result(self, case):
"""
:param case:
:return: CaseEvaluationResult. Scores and other information about single execution case
"""
return self._case_results[case]
def get_fold_ids(self):
"""
:return: fold ids used for computing this evaluation result
"""
return self._case_results[self._baseline_case].get_fold_ids()
def get_eval_step(self):
return self._case_results[self._baseline_case].get_eval_step()
def create_fold_learning_curves(self, fold, offset=None):
"""
:param fold: FoldId to plot
:param offset: first iteration to plot
:return: plotly figure for all cases on specified fold
"""
import plotly.graph_objs as go
traces = []
for case in self.get_cases():
case_result = self.get_case_result(case)
scores_curve = case_result.get_fold_curve(fold)
if offset is not None:
first_idx = offset
else:
first_idx = int(len(scores_curve) * 0.1)
traces.append(
go.Scatter(x=[i * int(case_result.get_eval_step()) for i in range(first_idx, len(scores_curve))],
y=scores_curve[first_idx:],
mode='lines',
name='Case {}'.format(case)))
layout = go.Layout(
title='Learning curves for metric {} on fold #{}'.format(self._metric_description, fold),
hovermode='closest',
xaxis=dict(
title='Iteration',
ticklen=5,
zeroline=False,
gridwidth=2,
),
yaxis=dict(
title='Metric',
ticklen=5,
gridwidth=2,
),
showlegend=True
)
fig = go.Figure(data=traces, layout=layout)
return fig
def __eq__(self, other):
return (self._case_results == other._case_results
and self._case_comparisons == other._case_comparisons
and self._cases == other._cases)
class EvaluationResults:
def __init__(self, metric_results):
if len(metric_results) < 1:
raise CatBoostError("Need at least one result")
self._results = dict()
self._metrics = dict()
self._cases = None
for result in metric_results:
metric_description = result.get_metric_description()
if metric_description in self._results:
raise CatBoostError("Duplicate metric {}".format(metric_description))
if self._cases is None:
self._cases = result.get_cases()
key = metric_description_or_str_to_str(metric_description)
self._results[key] = result
self._metrics[key] = metric_description
def get_metric_results(self, metric):
"""
:param metric:
:return: MetricEvaluationResult for specified metric
"""
return self._results[metric_description_or_str_to_str(metric)]
def get_metrics(self):
"""
:return: dict mapping metric names to metric descriptions for all computed metrics
"""
return self._metrics
def get_results(self):
"""
:return: map from metric names to computed results (instances of MetricEvaluationResult)
"""
return self._results
def set_baseline_case(self, case):
"""
Change the baseline case for all already computed metric results.
"""
for (metric, metric_result) in self._results.items():
metric_result.change_baseline_case(case)
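# End-to-end usage sketch (illustrative): `results` is assumed to be an
# EvaluationResults instance produced elsewhere (for example by CatBoost's
# feature evaluation tooling); only the API defined in this module is used:
#
#     for metric_name, metric_result in results.get_results().items():
#         print(metric_name)
#         print(metric_result.get_baseline_comparison())
#         baseline = metric_result.get_baseline_case()
#         print(metric_result.get_case_result(baseline).estimate_fit_quality())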