import numpy as np
import pandas as pd
from enum import Enum

from .. import CatBoostError
from ..core import metric_description_or_str_to_str
from ..utils import compute_wx_test


def calc_wilcoxon_test(baseline, test):
    return compute_wx_test(baseline, test)["pvalue"]


class ScoreType(Enum):
    Abs = "AbsoluteDiff"
    Rel = "RelativeDiff"


class ScoreConfig:
    """
    Config to present human-friendly evaluation results.
    """

    def __init__(self,
                 score_type=ScoreType.Rel,
                 multiplier=100,
                 score_level=0.01,
                 interval_level=0.01,
                 overfit_iterations_info=True):
        """
        :param score_type: type of score. For the absolute difference the score is (baseline - test).mean(),
        for the relative one it is ((baseline - test) / baseline).mean()
        :param multiplier: multiplier applied to the score before printing
        :param score_level: WX-test significance level used to decide whether the tested case is
        significantly better or worse than the baseline
        :param interval_level: level used to compute the score confidence interval
        :param overfit_iterations_info: whether information about overfitting iterations should be included
        """
        self.type = score_type
        self.multiplier = multiplier
        self.score_level = score_level
        self.interval_level = interval_level
        self.overfit_iterations_info = overfit_iterations_info

    @staticmethod
    def abs_score(level=0.01):
        return ScoreConfig(score_type=ScoreType.Abs,
                           multiplier=1,
                           score_level=level)

    @staticmethod
    def rel_score(level=0.01):
        return ScoreConfig(score_type=ScoreType.Rel,
                           multiplier=100,
                           score_level=level)


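# Illustrative usage sketch (not part of the original module): the factory methods above are
# the usual way to obtain a config, and _change_score_config() further below also accepts a
# bare ScoreType as shorthand.
#
#     config = ScoreConfig.rel_score(level=0.05)   # relative diff, reported in percent
#     config = ScoreConfig.abs_score(level=0.05)   # absolute diff, multiplier 1
#     config = ScoreConfig(score_type=ScoreType.Abs, multiplier=1000)  # fully custom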
def calc_bootstrap_ci_for_mean(samples, level=0.05, tries=999):
    """
    Compute a bootstrap confidence interval for the mean of the samples.

    :param samples: array-like sample of values
    :param level: (float) level for the confidence interval
    :param tries: number of bootstrap resamples to use
    :return: (left, right) borders of the confidence interval
    """
    samples = np.array(samples)
    if not (samples == 0).all():
        means = []
        for _ in range(0, tries):
            resample = np.random.choice(samples, len(samples))
            means.append(np.mean(resample))
        means = sorted(means)
        left = means[int(tries * (level / 2))]
        right = means[int(tries * (1.0 - level / 2))]
        return left, right
    else:
        return 0, 0


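# Illustrative example (assumes nothing beyond numpy, which is already imported): for a
# shifted normal sample the interval should bracket the true mean with high probability.
#
#     samples = np.random.normal(loc=0.5, scale=1.0, size=200)
#     left, right = calc_bootstrap_ci_for_mean(samples, level=0.05, tries=999)
#     assert left < right  # roughly a 95% CI around ~0.5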
class CaseEvaluationResult:
    """
    CaseEvaluationResult stores aggregated statistics for one ExecutionCase and one metric.
    """

    def __init__(self, case, metric_description, eval_step):
        self._case = case
        self._metric_description = metric_description

        self._fold_metric = pd.Series()
        self._fold_metric_iteration = pd.Series()
        self._fold_curves = dict()

        self._eval_step = eval_step

    def _add(self, model, learning_curve):
        if model.get_case() != self._case:
            raise CatBoostError("Model case should be equal to result case")

        fold_id = model.get_fold_id()

        self._fold_curves[fold_id] = learning_curve
        # Best metric value on the curve and the iteration index where it was reached.
        score = max(learning_curve) if self._metric_description.is_max_optimal() else min(learning_curve)
        position = np.argmax(learning_curve) if self._metric_description.is_max_optimal() else np.argmin(
            learning_curve)

        self._fold_metric.at[fold_id] = score
        self._fold_metric_iteration.at[fold_id] = position

    def __eq__(self, other):
        return (np.all(self._fold_metric == other._fold_metric)
                and np.all(self._fold_metric_iteration == other._fold_metric_iteration)
                and self._fold_curves == other._fold_curves)

    def get_case(self):
        """
        :return: ExecutionCase for this result
        """
        return self._case

    def get_fold_ids(self):
        """
        :return: fold ids for which this case result was calculated
        """
        return self._fold_curves.keys()

    def get_best_metric_for_fold(self, fold):
        """
        :param fold: id of the fold to get the result for
        :return: best metric value, best metric iteration
        """
        return self._fold_metric[fold], self._fold_metric_iteration[fold]

    def get_best_iterations(self):
        """
        :return: pandas Series with the best iteration on each fold
        """
        return self._fold_metric_iteration

    def get_best_metrics(self):
        """
        :return: pandas Series with the best metric value on each fold
        """
        return self._fold_metric

    def get_fold_curve(self, fold):
        """
        :param fold: fold id
        :return: fold learning curve (test scores on every eval_period iteration)
        """
        return self._fold_curves[fold]

    def get_metric_description(self):
        """
        :return: metric used to build this CaseEvaluationResult
        """
        return self._metric_description

    def get_eval_step(self):
        """
        :return: step which was used for metric computations
        """
        return self._eval_step

    def count_under_and_over_fits(self, overfit_border=0.15, underfit_border=0.95):
        """
        :param overfit_border: minimal fraction of iterations before overfitting starts that one expects
        all models to have; folds whose best iteration comes earlier are counted as overfitting
        :param underfit_border: fraction of iterations after which there should be no best metric scores;
        folds whose best iteration comes later are counted as underfitting
        :return: (#folds counted as overfitting, #folds counted as underfitting)
        """
        count_overfitting = 0
        count_underfitting = 0

        for fold_id, fold_curve in self._fold_curves.items():
            best_score_position = self._fold_metric_iteration[fold_id]
            best_model_size_fraction = best_score_position * 1.0 / len(fold_curve)
            if best_model_size_fraction > underfit_border:
                count_underfitting += 1
            elif best_model_size_fraction < overfit_border:
                count_overfitting += 1
        return count_overfitting, count_underfitting

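    # Worked example for the thresholds above (illustrative, with the default borders): if a fold's
    # best metric is reached at point 40 of a 1000-point curve, the fraction is 0.04 < 0.15, so the
    # fold counts as overfitting; if it is reached at point 980 (fraction 0.98 > 0.95), the fold
    # counts as underfitting; anything in between counts as neither.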
    def estimate_fit_quality(self):
        """
        :return: simple sanity check: "Overfitting" if more folds overfit than underfit,
        "Underfitting" if the opposite holds, and "Good" otherwise
        """
        count_overfitting, count_underfitting = self.count_under_and_over_fits()
        if count_overfitting > count_underfitting:
            return "Overfitting"
        if count_underfitting > count_overfitting:
            return "Underfitting"
        return "Good"

    def create_learning_curves_plot(self, offset=None):
        """
        :param offset: first iteration to plot
        :return: plotly Figure with learning curves for each fold
        """
        import plotly.graph_objs as go

        traces = []

        for fold in self.get_fold_ids():
            scores_curve = self.get_fold_curve(fold)
            if offset is not None:
                first_idx = offset
            else:
                first_idx = int(len(scores_curve) * 0.1)

            traces.append(go.Scatter(x=[i * int(self._eval_step) for i in range(first_idx, len(scores_curve))],
                                     y=scores_curve[first_idx:],
                                     mode='lines',
                                     name='Fold #{}'.format(fold)))

        layout = go.Layout(
            title='Learning curves for case {}'.format(self._case),
            hovermode='closest',
            xaxis=dict(
                title='Iteration',
                ticklen=5,
                zeroline=False,
                gridwidth=2,
            ),
            yaxis=dict(
                title='Metric',
                ticklen=5,
                gridwidth=2,
            ),
            showlegend=True
        )
        fig = go.Figure(data=traces, layout=layout)
        return fig


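# Illustrative usage sketch (hypothetical `case_result` is a CaseEvaluationResult obtained from
# an evaluation run, e.g. via MetricEvaluationResult.get_case_result()):
#
#     best_scores = case_result.get_best_metrics()      # pandas Series, one value per fold
#     best_iters = case_result.get_best_iterations()    # matching best-iteration positions
#     fig = case_result.create_learning_curves_plot(offset=0)
#     # fig is a plotly Figure; display it with plotly.offline.iplot(fig) or fig.show(),
#     # depending on the plotly version available.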
class MetricEvaluationResult:
    """
    Evaluation result for one metric.
    Stores the scores of all ExecutionCases for the specified metric and
    computes human-friendly tables and plots from the results.
    """

    def __init__(self, case_results):
        if len(case_results) <= 1:
            raise CatBoostError("Need at least 2 case results, got {}".format(len(case_results)))

        self._case_results = dict()
        self._case_comparisons = dict()
        self._cases = [case_result.get_case() for case_result in case_results]

        for case_result in case_results:
            case = case_result.get_case()
            self._case_results[case] = case_result

        self._metric_description = case_results[0].get_metric_description()
        self._baseline_case = case_results[0].get_case()

        self._score_config = ScoreConfig()

        for (case, case_result) in self._case_results.items():
            if case_result.get_metric_description() != self._metric_description:
                raise CatBoostError("Metric names should be equal for all case results")

            if case_result.get_fold_ids() != self.get_fold_ids():
                raise CatBoostError("Case results should be computed on the same folds")

            if case_result.get_eval_step() != self.get_eval_step():
                raise CatBoostError("Eval steps should be equal for different cases")

    def __clear_comparisons(self):
        self._case_comparisons = dict()

    def _change_score_config(self, config):
        if config is not None:
            if isinstance(config, ScoreType):
                if config == ScoreType.Abs:
                    config = ScoreConfig.abs_score()
                elif config == ScoreType.Rel:
                    config = ScoreConfig.rel_score()
                else:
                    raise CatBoostError("Unknown scoreType {}".format(config))
            if self._score_config != config:
                self._score_config = config
                self.__clear_comparisons()

    def _compute_case_result_table(self, baseline_case):
        result = pd.DataFrame()
        baseline_scores = self._case_results[baseline_case].get_best_metrics()
        baseline_iters = self._case_results[baseline_case].get_best_iterations()

        for (case, case_result) in self._case_results.items():
            if case != baseline_case:
                test_scores = case_result.get_best_metrics()
                pvalue = calc_wilcoxon_test(baseline_scores, test_scores)

                diff = (baseline_scores - test_scores)
                if self._score_config.type == ScoreType.Rel:
                    diff = diff / baseline_scores.abs()
                if self._metric_description.is_max_optimal():
                    # Flip the sign so that a positive score always means the tested case is better.
                    diff = -diff
                mean_diff = diff.mean()

                left_quantile, right_quantile = calc_bootstrap_ci_for_mean(diff,
                                                                           self._score_config.interval_level)

                case_name = str(case)

                result.at[case_name, "PValue"] = pvalue
                result.at[case_name, "Score"] = mean_diff * self._score_config.multiplier

                left_quantile_title = "Quantile {}".format(self._score_config.interval_level / 2)
                right_quantile_title = "Quantile {}".format(1.0 - self._score_config.interval_level / 2)

                result.at[case_name, left_quantile_title] = left_quantile * self._score_config.multiplier
                result.at[case_name, right_quantile_title] = right_quantile * self._score_config.multiplier

                decision = "UNKNOWN"
                if pvalue < self._score_config.score_level:
                    if mean_diff > 0:
                        decision = "GOOD"
                    elif mean_diff < 0:
                        decision = "BAD"
                result.at[case_name, "Decision"] = decision

                if self._score_config.overfit_iterations_info:
                    test_iters = case_result.get_best_iterations()
                    pvalue = calc_wilcoxon_test(baseline_iters, test_iters)

                    result.at[case_name, "Overfit iter diff"] = (test_iters - baseline_iters).mean()
                    result.at[case_name, "Overfit iter pValue"] = pvalue

        return result.sort_values(by=["Score"], ascending=self._metric_description.is_max_optimal())

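    # Worked example for the Score column above (illustrative numbers): with a loss-like metric
    # (is_max_optimal() == False), baseline fold scores [0.50, 0.40] and tested fold scores
    # [0.45, 0.38] give relative diffs [(0.50-0.45)/0.50, (0.40-0.38)/0.40] = [0.10, 0.05],
    # so Score = mean(0.10, 0.05) * 100 = 7.5 with the default ScoreType.Rel config, i.e. the
    # tested case is about 7.5% better than the baseline.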
    def get_baseline_case(self):
        """
        :return: ExecutionCase used as the baseline (against which everything else is compared)
        """
        return self._baseline_case

    def get_cases(self):
        """
        :return: cases which are compared
        """
        return self._cases

    def get_metric_description(self):
        """
        :return: metric for which the results were calculated
        """
        return self._metric_description

    def get_baseline_comparison(self, score_config=None):
        """
        Get a human-friendly table with model comparisons.

        Compares the baseline case against all other computed cases.
        :param score_config: optional config to present a human-friendly score; instance of ScoreConfig
        :return: pandas DataFrame. Each row is related to one ExecutionCase and describes
        how much better (or worse) this case is compared to the baseline.
        """
        case = self._baseline_case
        return self.get_case_comparison(case, score_config)

    def get_case_comparison(self, case, score_config=None):
        """
        Get a human-friendly table with model comparisons.
        Same as get_baseline_comparison(), but with another (non-baseline) case used as the baseline.

        :param case: use the specified case as the baseline
        :param score_config: optional config to present a human-friendly score; instance of ScoreConfig
        :return: pandas DataFrame. Each row is related to one ExecutionCase and describes
        how much better (or worse) this case is compared to the baseline.
        """
        self._change_score_config(score_config)
        if case not in self._case_comparisons:
            self._case_comparisons[case] = self._compute_case_result_table(case)
        return self._case_comparisons[case]

    def change_baseline_case(self, case):
        """
        :param case: new baseline case
        """
        if case not in self._case_results:
            raise CatBoostError("Case {} is unknown. Can't use it as baseline".format(case))
        self._baseline_case = case

    def get_case_result(self, case):
        """
        :param case: execution case
        :return: CaseEvaluationResult with scores and other information about the single execution case
        """
        return self._case_results[case]

    def get_fold_ids(self):
        """
        :return: fold ids which were used for computing this evaluation result
        """
        return self._case_results[self._baseline_case].get_fold_ids()

    def get_eval_step(self):
        return self._case_results[self._baseline_case].get_eval_step()

    def create_fold_learning_curves(self, fold, offset=None):
        """
        :param fold: fold id to plot
        :param offset: first iteration to plot
        :return: plotly Figure for all cases on the specified fold
        """
        import plotly.graph_objs as go
        traces = []
        for case in self.get_cases():
            case_result = self.get_case_result(case)
            scores_curve = case_result.get_fold_curve(fold)
            if offset is not None:
                first_idx = offset
            else:
                first_idx = int(len(scores_curve) * 0.1)

            traces.append(
                go.Scatter(x=[i * int(case_result.get_eval_step()) for i in range(first_idx, len(scores_curve))],
                           y=scores_curve[first_idx:],
                           mode='lines',
                           name='Case {}'.format(case)))

        layout = go.Layout(
            title='Learning curves for metric {} on fold #{}'.format(self._metric_description, fold),
            hovermode='closest',
            xaxis=dict(
                title='Iteration',
                ticklen=5,
                zeroline=False,
                gridwidth=2,
            ),
            yaxis=dict(
                title='Metric',
                ticklen=5,
                gridwidth=2,
            ),
            showlegend=True
        )
        fig = go.Figure(data=traces, layout=layout)
        return fig

    def __eq__(self, other):
        return (self._case_results == other._case_results
                and self._case_comparisons == other._case_comparisons
                and self._cases == other._cases)


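# Illustrative usage sketch (hypothetical `metric_result` is a MetricEvaluationResult, typically
# obtained from EvaluationResults.get_metric_results()):
#
#     table = metric_result.get_baseline_comparison(score_config=ScoreConfig.rel_score(level=0.05))
#     # `table` is a pandas DataFrame with one row per non-baseline case and the columns
#     # "PValue", "Score", the two "Quantile ..." CI borders, "Decision", and, if enabled,
#     # "Overfit iter diff" / "Overfit iter pValue".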
class EvaluationResults:

    def __init__(self, metric_results):
        if len(metric_results) < 1:
            raise CatBoostError("Need at least one result")

        self._results = dict()
        self._metrics = dict()
        self._cases = None

        for result in metric_results:
            metric_description = result.get_metric_description()
            key = metric_description_or_str_to_str(metric_description)
            if key in self._results:
                raise CatBoostError("Duplicate metric {}".format(metric_description))
            if self._cases is None:
                self._cases = result.get_cases()
            self._results[key] = result
            self._metrics[key] = metric_description

    def get_metric_results(self, metric):
        """
        :param metric: metric name or description
        :return: MetricEvaluationResult for the specified metric
        """
        return self._results[metric_description_or_str_to_str(metric)]

    def get_metrics(self):
        """
        :return: map from metric name to metric description for the metrics which were computed
        """
        return self._metrics

    def get_results(self):
        """
        :return: map from metric name to the computed result (instance of MetricEvaluationResult)
        for that metric
        """
        return self._results

    def set_baseline_case(self, case):
        """
        Change the baseline case for already computed results.
        """
        for (metric, metric_result) in self._results.items():
            metric_result.change_baseline_case(case)
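# Illustrative end-to-end access sketch (hypothetical `evaluation_results` is an EvaluationResults
# instance produced elsewhere by this package's evaluation machinery):
#
#     for metric_name in evaluation_results.get_metrics():
#         metric_result = evaluation_results.get_metric_results(metric_name)
#         comparison_table = metric_result.get_baseline_comparison()
#         print(metric_name)
#         print(comparison_table)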