Files
MLPproject/.venv/lib/python3.12/site-packages/catboost/utils.py
2025-10-23 15:44:32 +02:00

766 lines
27 KiB
Python

from . import _catboost
from .core import Pool, CatBoostError, ARRAY_TYPES, PATH_TYPES, fspath, _update_params_quantize_part, _process_synonyms
from collections import defaultdict
from contextlib import contextmanager
import sys
import numpy as np
import warnings
# Module-level aliases for helpers provided by the ``_catboost`` module.
# Underscore-prefixed aliases are used internally by the functions in this file;
# the remaining names (compute_wx_test, TargetStats, DataMetaInfo,
# compute_training_options) are bound here without an underscore and are therefore
# reachable as attributes of this module.
_eval_metric_util = _catboost._eval_metric_util
_get_roc_curve = _catboost._get_roc_curve
_get_confusion_matrix = _catboost._get_confusion_matrix
_select_threshold = _catboost._select_threshold
_NumpyAwareEncoder = _catboost._NumpyAwareEncoder
_get_onnx_model = _catboost._get_onnx_model
_calculate_quantization_grid = _catboost._calculate_quantization_grid
compute_wx_test = _catboost.compute_wx_test
TargetStats = _catboost.TargetStats
DataMetaInfo = _catboost.DataMetaInfo
compute_training_options = _catboost.compute_training_options
@contextmanager
def _import_matplotlib():
    """
    Context manager that lazily imports ``matplotlib.pyplot`` and yields it.

    The import happens when the context is entered, so matplotlib is only
    required by callers that actually draw something.

    Raises
    ------
    ImportError
        If ``matplotlib.pyplot`` cannot be imported. A warning advising to
        install matplotlib is emitted before the error is raised.
    """
    try:
        import matplotlib.pyplot as pyplot
    except ImportError as import_error:
        warnings.warn("To draw plots you should install matplotlib.")
        raise ImportError(str(import_error))
    else:
        # Exceptions raised inside the caller's ``with`` body are not handled here.
        yield pyplot
def _draw(plt, x, y, x_label, y_label, title):
    """
    Draw a single line plot of ``y`` against ``x`` and show it.

    Parameters
    ----------
    plt : module-like object
        Object exposing the ``matplotlib.pyplot`` functions used here
        (figure, plot, xticks, yticks, grid, xlabel, ylabel, title, show).
    x, y : sequences
        Coordinates of the curve points.
    x_label, y_label : string
        Axis captions.
    title : string
        Figure title.
    """
    axis_font_size = 16
    title_font_size = 20

    plt.figure(figsize=(16, 8))
    plt.plot(x, y, alpha=0.5, lw=2)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=axis_font_size)
    plt.grid(True)
    for set_label, caption in ((plt.xlabel, x_label), (plt.ylabel, y_label)):
        set_label(caption, fontsize=axis_font_size)
    plt.title(title, fontsize=title_font_size)
    plt.show()
def create_cd(
    label=None,
    cat_features=None,
    text_features=None,
    embedding_features=None,
    weight=None,
    baseline=None,
    doc_id=None,
    group_id=None,
    subgroup_id=None,
    timestamp=None,
    auxiliary_columns=None,
    feature_names=None,
    output_path='train.cd'
):
    """
    Write a column description file.

    Each described column produces one line ``<index>\\t<type>\\t<name>`` in the
    output file; lines are written in increasing column index order.

    Parameters
    ----------
    label, weight, baseline, doc_id, group_id, subgroup_id, timestamp : int, [default=None]
        Index of the column with the corresponding role
        (written as Label, Weight, Baseline, DocId, GroupId, SubgroupId, Timestamp).
    cat_features, text_features, embedding_features, auxiliary_columns : int or iterable of int, [default=None]
        Indices of the columns written as Categ, Text, NumVector and Auxiliary respectively.
    feature_names : dict, [default=None]
        Mapping column index -> feature name. An index not described by any other
        parameter is written as a Num column with that name.
    output_path : string or pathlib.Path, [default='train.cd']
        Path of the file to write.

    Raises
    ------
    CatBoostError
        If an index is not an int, if an index is used more than once, or if
        feature_names refers to a column that is not a feature column.
    """
    feature_column_types = ('Num', 'Categ', 'Text', 'NumVector')

    # (column type, argument value, accepts several indices), listed in signature
    # order so that validation errors are reported in the same order as before.
    column_specs = (
        ('Label', label, False),
        ('Categ', cat_features, True),
        ('Text', text_features, True),
        ('NumVector', embedding_features, True),
        ('Weight', weight, False),
        ('Baseline', baseline, False),
        ('DocId', doc_id, False),
        ('GroupId', group_id, False),
        ('SubgroupId', subgroup_id, False),
        ('Timestamp', timestamp, False),
        ('Auxiliary', auxiliary_columns, True),
    )

    # index -> [type, name]; an index that is looked up but not registered becomes 'Num'.
    columns = defaultdict(lambda: ['Num', ''])

    def register(column_index, column_type):
        if not isinstance(column_index, int):
            raise CatBoostError('Unsupported index type. Expected int, got {}'.format(type(column_index)))
        if column_index in columns:
            raise CatBoostError('The index {} occurs more than once'.format(column_index))
        columns[column_index] = [column_type, '']

    for column_type, argument, accepts_many in column_specs:
        if argument is None:
            continue
        if not accepts_many:
            register(argument, column_type)
            continue
        indices = [argument] if isinstance(argument, int) else argument
        for column_index in indices:
            register(column_index, column_type)

    if feature_names is not None:
        for column_index, feature_name in feature_names.items():
            entry = columns[column_index]
            if entry[0] not in feature_column_types:
                raise CatBoostError('feature_names contains index {} that does not correspond to feature column'.format(column_index))
            entry[1] = feature_name

    with open(fspath(output_path), 'w') as cd_output:
        for column_index in sorted(columns):
            column_type, column_name = columns[column_index]
            cd_output.write('{}\t{}\t{}\n'.format(column_index, column_type, column_name))
def read_cd(cd_file, column_count=None, data_file=None, canonize_column_types=False):
    """
    Reads CatBoost column description file
    (see https://catboost.ai/docs/concepts/input-data_column-descfile.html#input-data_column-descfile)

    Parameters
    ----------
    cd_file : str or pathlib.Path
        path to column description file

    column_count : integer
        total number of columns

    data_file : str or pathlib.Path
        path to dataset file in CatBoost format
        specify either column_count directly or data_file to detect it
        (the count is taken from the number of tab-separated fields in its first line)

    canonize_column_types : bool
        if set to True types for columns with synonyms are renamed to canonical type.

    Returns
    -------
    dict with keys:
        "column_type_to_indices" :
            dict of column_type -> column_indices list, column_type is 'Label', 'Categ' etc.

        "column_dtypes" : dict of column_name -> numpy.dtype or 'category'

        "cat_feature_indices" : list of integers
            indices of categorical features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "text_feature_indices" : list of integers
            indices of text features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "embedding_feature_indices" : list of integers
            indices of embedding features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "column_names" : list of strings

        "non_feature_column_indices" : list of integers

    Raises
    ------
    Exception
        If neither column_count nor data_file is specified, if a non-empty line of the
        cd file does not contain 2 or 3 tab-separated fields, or if the same column
        index is described more than once.
    """
    column_type_synonyms_map = {
        'Target': 'Label',
        'DocId': 'SampleId',
        'QueryId': 'GroupId'
    }

    if column_count is None:
        if data_file is None:
            raise Exception(
                'Cannot obtain column count: either specify column_count parameter or specify data_file '
                + 'parameter to get it'
            )
        with open(fspath(data_file)) as f:
            # Strip only the line terminator: a first line without a trailing newline
            # must not lose its last real character.
            column_count = len(f.readline().rstrip('\n').split('\t'))

    column_type_to_indices = {}
    column_dtypes = {}
    cat_feature_indices = []
    text_feature_indices = []
    embedding_feature_indices = []
    column_names = []
    non_feature_column_indices = []

    # list of (column_idx, column_type, column_name) tuples, needed to support CD files with nonincreasing
    # column indices
    column_descriptions = []

    with open(fspath(cd_file)) as f:
        for line in f:
            line = line.strip()
            # some cd files in the wild contain empty lines
            if len(line) == 0:
                continue
            line_columns = line.split('\t')
            if len(line_columns) not in [2, 3]:
                raise Exception('Wrong number of columns in cd file')

            column_idx = int(line_columns[0])
            column_type = line_columns[1]
            column_name = None
            if len(line_columns) == 3:
                column_name = line_columns[2]

            column_descriptions.append((column_idx, column_type, column_name))

    # Order by column index only. Sorting whole tuples would compare column names when
    # index and type coincide, and None cannot be ordered against str, so duplicated
    # indices would surface as a TypeError instead of the duplicate-index error below.
    column_descriptions.sort(key=lambda description: description[0])

    def add_missed_columns(start_column_idx, end_column_idx, non_feature_column_count):
        # Columns not mentioned in the cd file are numeric features with generated names.
        for missed_column_idx in range(start_column_idx, end_column_idx):
            column_name = 'feature_%i' % (missed_column_idx - non_feature_column_count)
            column_names.append(column_name)
            column_type_to_indices.setdefault('Num', []).append(missed_column_idx)
            column_dtypes[column_name] = np.float32

    last_column_idx = -1
    for column_idx, column_type, column_name in column_descriptions:
        # descriptions are sorted by index, so duplicates are always adjacent
        if column_idx == last_column_idx:
            raise Exception('Duplicate column indices in cd file')

        add_missed_columns(last_column_idx + 1, column_idx, len(non_feature_column_indices))

        if canonize_column_types:
            column_type = column_type_synonyms_map.get(column_type, column_type)

        column_type_to_indices.setdefault(column_type, []).append(column_idx)

        if column_type in ['Num', 'Categ', 'Text', 'NumVector']:
            # feature index = column index minus the non-feature columns seen so far
            feature_idx = column_idx - len(non_feature_column_indices)
            if column_name is None:
                column_name = 'feature_%i' % feature_idx
            if column_type == 'Categ':
                cat_feature_indices.append(feature_idx)
                column_dtypes[column_name] = 'category'
            elif column_type == 'Text':
                text_feature_indices.append(feature_idx)
                column_dtypes[column_name] = object
            elif column_type == 'NumVector':
                embedding_feature_indices.append(feature_idx)
                column_dtypes[column_name] = object
            else:
                column_dtypes[column_name] = np.float32
        else:
            non_feature_column_indices.append(column_idx)
            if column_name is None:
                column_name = column_type

        column_names.append(column_name)

        last_column_idx = column_idx

    add_missed_columns(last_column_idx + 1, column_count, len(non_feature_column_indices))

    return {
        'column_type_to_indices' : column_type_to_indices,
        'column_dtypes' : column_dtypes,
        'cat_feature_indices' : cat_feature_indices,
        'text_feature_indices' : text_feature_indices,
        'embedding_feature_indices' : embedding_feature_indices,
        'column_names' : column_names,
        'non_feature_column_indices' : non_feature_column_indices
    }
def eval_metric(label, approx, metric, weight=None, group_id=None, group_weight=None, subgroup_id=None, pairs=None, thread_count=-1):
    """
    Evaluate metrics with raw approxes and labels.

    Parameters
    ----------
    label : list or numpy.ndarrays or pandas.DataFrame or pandas.Series
        Object labels with shape (n_objects,) or (n_object, n_target_dimension)

    approx : list or numpy.ndarrays or pandas.DataFrame or pandas.Series
        Object approxes with shape (n_objects,) or (n_object, n_approx_dimension).

    metric : string
        Metric name.

    weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Object weights.

    group_id : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Object group ids.

    group_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Group weights.

    subgroup_id : list or numpy.ndarray, optional (default=None)
        subgroup id for each instance.
        If not None, giving 1 dimensional array like data.

    pairs : list or numpy.ndarray or pandas.DataFrame or string or pathlib.Path
        The pairs description.
        If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.
        The shape should be Nx2, where N is the pairs' count. The first element of the pair is
        the index of winner object in the training set. The second element of the pair is
        the index of loser object in the training set.
        If string or pathlib.Path, giving the path to the file with pairs description.

    thread_count : int, optional (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    metric results : list with metric values.
    """
    # Bring label to a "list of dimensions" layout: per-object rows are transposed,
    # a flat sequence is wrapped into a single dimension. Empty label is passed as is.
    if len(label) > 0:
        if isinstance(label[0], ARRAY_TYPES):
            label = np.transpose(label)
        else:
            label = [label]

    # Same layout for approx; an empty approx is first replaced by one empty row.
    if len(approx) == 0:
        approx = [[]]
    if isinstance(approx[0], ARRAY_TYPES):
        approx = np.transpose(approx)
    else:
        approx = [approx]

    return _eval_metric_util(
        label, approx, metric, weight, group_id, group_weight, subgroup_id, pairs, thread_count
    )
def get_gpu_device_count():
    """
    Return the result of ``_catboost._get_gpu_device_count()``
    (presumably the number of GPU devices available to CatBoost).
    """
    device_count = _catboost._get_gpu_device_count()
    return device_count
def reset_trace_backend(filename):
    """
    Forward ``filename`` to ``_catboost._reset_trace_backend``.

    Parameters
    ----------
    filename : value accepted by ``_catboost._reset_trace_backend``
        Presumably the path of the file the trace backend should write to.
    """
    _catboost._reset_trace_backend(filename)
    return None
def get_confusion_matrix(model, data, thread_count=-1):
    """
    Build confusion matrix.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool
        A set of samples to build confusion matrix with.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    confusion matrix : array, shape = [n_classes, n_classes]

    Raises
    ------
    CatBoostError
        If data is not a catboost.Pool.
    """
    if isinstance(data, Pool):
        return _get_confusion_matrix(model._object, data, thread_count)
    raise CatBoostError('data must be a catboost.Pool')
def get_roc_curve(model, data, thread_count=-1, plot=False):
    """
    Build points of ROC curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of three arrays (fpr, tpr, thresholds)

    Raises
    ------
    CatBoostError
        If data is neither a catboost.Pool nor a list of catboost.Pool objects.
    """
    pools = [data] if isinstance(data, Pool) else data
    if not isinstance(pools, list):
        raise CatBoostError('data must be a catboost.Pool or list of pools.')
    if any(not isinstance(candidate, Pool) for candidate in pools):
        raise CatBoostError('one of data pools is not catboost.Pool')

    curve_points = _get_roc_curve(model._object, pools, thread_count)

    if plot:
        with _import_matplotlib() as plt:
            _draw(plt, curve_points[0], curve_points[1], 'False Positive Rate', 'True Positive Rate', 'ROC Curve')

    return curve_points
def get_fpr_curve(model=None, data=None, curve=None, thread_count=-1, plot=False):
    """
    Build points of FPR curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of two arrays (thresholds, fpr)

    Raises
    ------
    CatBoostError
        If both data and curve are set, if curve is not a list or tuple of length three,
        or if curve is None and model or data is missing.
    """
    if curve is None:
        # No precomputed curve: build the ROC curve from model and data.
        if model is None or data is None:
            raise CatBoostError('model and data parameters should be set when curve parameter is None.')
        fpr, _, thresholds = get_roc_curve(model, data, thread_count)
    else:
        if data is not None:
            raise CatBoostError('Only one of the parameters data and curve should be set.')
        if not isinstance(curve, (list, tuple)) or len(curve) != 3:
            raise CatBoostError('curve must be list or tuple of three arrays (fpr, tpr, thresholds).')
        # Full slices of the caller's sequences are returned, not the objects themselves.
        fpr, thresholds = curve[0][:], curve[2][:]

    if plot:
        with _import_matplotlib() as plt:
            _draw(plt, thresholds, fpr, 'Thresholds', 'False Positive Rate', 'FPR Curve')

    return thresholds, fpr
def get_fnr_curve(model=None, data=None, curve=None, thread_count=-1, plot=False):
    """
    Build points of FNR curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of two arrays (thresholds, fnr)

    Raises
    ------
    CatBoostError
        If both data and curve are set, if curve is not a list or tuple of length three,
        or if curve is None and model or data is missing.
    """
    if curve is None:
        # No precomputed curve: build the ROC curve from model and data.
        if model is None or data is None:
            raise CatBoostError('model and data parameters should be set when curve parameter is None.')
        _, tpr, thresholds = get_roc_curve(model, data, thread_count)
    else:
        if data is not None:
            raise CatBoostError('Only one of the parameters data and curve should be set.')
        if not isinstance(curve, (list, tuple)) or len(curve) != 3:
            raise CatBoostError('curve must be list or tuple of three arrays (fpr, tpr, thresholds).')
        tpr, thresholds = curve[1], curve[2][:]

    # false negative rate is the complement of the true positive rate
    fnr = np.array([1 - rate for rate in tpr])

    if plot:
        with _import_matplotlib() as plt:
            _draw(plt, thresholds, fnr, 'Thresholds', 'False Negative Rate', 'FNR Curve')

    return thresholds, fnr
def select_threshold(model=None, data=None, curve=None, FPR=None, FNR=None, thread_count=-1):
    """
    Selects a threshold for prediction.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        Set of samples to build ROC curve with.
        If set, curve parameter must not be set.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    FPR : desired false-positive rate

    FNR : desired false-negative rate (only one of FPR and FNR should be chosen)

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    threshold : double

    Raises
    ------
    CatBoostError
        If neither or both of data and curve are set, if data is set without model,
        if data is not a Pool or list of Pools, or if curve is not a list or tuple
        of length three.
    """
    if data is None:
        # Curve-based selection.
        if curve is None:
            raise CatBoostError('One of the parameters data and curve should be set.')
        if not isinstance(curve, (list, tuple)) or len(curve) != 3:
            raise CatBoostError('curve must be list or tuple of three arrays (fpr, tpr, thresholds).')
        return _select_threshold(None, None, curve, FPR, FNR, thread_count)

    # Data-based selection.
    if curve is not None:
        raise CatBoostError('Only one of the parameters data and curve should be set.')
    if model is None:
        raise CatBoostError('model and data parameters should be set when curve parameter is None.')

    pools = [data] if isinstance(data, Pool) else data
    if not isinstance(pools, list):
        raise CatBoostError('data must be a catboost.Pool or list of pools.')
    if any(not isinstance(candidate, Pool) for candidate in pools):
        raise CatBoostError('one of data pools is not catboost.Pool')

    return _select_threshold(model._object, pools, None, FPR, FNR, thread_count)
def quantize(
    data_path,
    column_description=None,
    pairs=None,
    graph=None,
    delimiter='\t',
    has_header=False,
    ignore_csv_quoting=False,
    feature_names=None,
    thread_count=-1,
    ignored_features=None,
    per_float_feature_quantization=None,
    border_count=None,
    max_bin=None,
    feature_border_type=None,
    nan_mode=None,
    input_borders=None,
    task_type=None,
    used_ram_limit=None,
    random_seed=None,
    log_cout=sys.stdout,
    log_cerr=sys.stderr,
    **kwargs
):
    """
    Construct quantized Pool from non-quantized pool stored in file.
    This method does not load whole non-quantized source dataset into memory
    so it can be used for huge datasets that fit in memory only after quantization.

    Parameters
    ----------
    data_path : string or pathlib.Path
        Path (with optional scheme) to non-quantized dataset.

    column_description : string, [default=None]
        ColumnsDescription parameter.
        There are several columns description types: Label, Categ, Num, Auxiliary, DocId, Weight, Baseline, GroupId, Timestamp.
        All columns are Num as default, it's not necessary to specify
        this type of columns. Default Label column index is 0 (zero).
        If None, Label column is 0 (zero) as default, all data columns are Num as default.
        If string or pathlib.Path, giving the path to the file with ColumnsDescription in column_description format.

    pairs : string or pathlib.Path, [default=None]
        Path to the file with pairs description.

    graph : string or pathlib.Path, [default=None]
        Path to the file with graph description.

    delimiter : string, [default='\\t']
        Column delimiter forwarded to the dataset reader.

    has_header : bool, [default=False]
        If True, read column names from first line.

    ignore_csv_quoting : bool optional (default=False)
        If True ignore quoting '"'.

    feature_names : string or pathlib.Path, [default=None]
        Path with scheme for feature names data to load.

    thread_count : int, [default=-1]
        Thread count for data processing.
        If -1, then the number of threads is set to the number of CPU cores.

    ignored_features : list, [default=None]
        Indices or names of features that should be excluded when training.

    per_float_feature_quantization : list of strings, [default=None]
        List of float binarization descriptions.
        Format : described in documentation on catboost.ai
        Example 1: ['0:1024'] means that feature 0 will have 1024 borders.
        Example 2: ['0:border_count=1024', '1:border_count=1024', ...] means that two first features have 1024 borders.
        Example 3: ['0:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum',
                    '1:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum'] - defines more quantization properties for first two features.

    border_count : int, [default = 254 for training on CPU or 128 for training on GPU]
        The number of partitions in numeric features binarization. Used in the preliminary calculation.
        range: [1,65535] on CPU, [1,255] on GPU

    max_bin : float, synonym for border_count.
        Used only when border_count is None.

    feature_border_type : string, [default='GreedyLogSum']
        The binarization mode in numeric features binarization. Used in the preliminary calculation.
        Possible values:
            - 'Median'
            - 'Uniform'
            - 'UniformAndQuantiles'
            - 'GreedyLogSum'
            - 'MaxLogSum'
            - 'MinEntropy'

    nan_mode : string, [default=None]
        Way to process missing values for numeric features.
        Possible values:
            - 'Forbidden' - raises an exception if there is a missing value for a numeric feature in a dataset.
            - 'Min' - each missing value will be processed as the minimum numerical value.
            - 'Max' - each missing value will be processed as the maximum numerical value.
        If None, then nan_mode=Min.

    input_borders : string or pathlib.Path, [default=None]
        input file with borders used in numeric features binarization.

    task_type : string, [default=None]
        The calcer type used to train the model.
        Possible values:
            - 'CPU'
            - 'GPU'

    used_ram_limit : [default=None]
        Forwarded unchanged to the quantization parameters.

    random_seed : int, [default=None]
        The random seed used for data sampling.
        If None, 0 is used.

    log_cout, log_cerr : [default=sys.stdout, sys.stderr]
        Forwarded unchanged to the dataset reader (presumably the streams that
        receive its log output).

    **kwargs
        Only 'dev_block_size' and 'dev_max_subset_size_for_build_borders' are accepted.

    Returns
    -------
    pool : Pool
        Constructed and quantized pool.

    Raises
    ------
    CatBoostError
        If data_path is empty or any path-like argument has a wrong type,
        or if unknown keyword arguments are passed.
    """
    if not data_path:
        raise CatBoostError("Data filename is empty.")
    if not isinstance(data_path, PATH_TYPES):
        raise CatBoostError("Data filename should be string or pathlib.Path type.")

    if pairs is not None and not isinstance(pairs, PATH_TYPES):
        raise CatBoostError("pairs should have None or string or pathlib.Path type when the pool is read from the file.")

    if column_description is not None and not isinstance(column_description, PATH_TYPES):
        raise CatBoostError("column_description should have None or string or pathlib.Path type when the pool is read from the file.")

    if feature_names is not None and not isinstance(feature_names, PATH_TYPES):
        raise CatBoostError("feature_names should have None or string or pathlib.Path type when the pool is read from the file.")

    params = {}
    _process_synonyms(params)

    if border_count is None:
        border_count = max_bin

    if 'dev_block_size' in kwargs:
        params['dev_block_size'] = kwargs.pop('dev_block_size')

    dev_max_subset_size_for_build_borders = kwargs.pop('dev_max_subset_size_for_build_borders', None)

    if kwargs:
        # list the offending names themselves rather than a dict_keys(...) repr
        raise CatBoostError("got unexpected keyword arguments: {}".format(', '.join(sorted(kwargs))))

    _update_params_quantize_part(
        params,
        ignored_features,
        per_float_feature_quantization,
        border_count,
        feature_border_type,
        None,  # sparse_features_conflict_fraction
        None,  # dev_efb_max_buckets
        nan_mode,
        input_borders,
        task_type,
        used_ram_limit,
        random_seed,
        dev_max_subset_size_for_build_borders
    )

    # Start from an empty pool and let the quantizing _read() below be the only pass
    # over the file. Constructing Pool(data_path, ...) here would first read the whole
    # non-quantized dataset, which is exactly what this function promises not to do.
    # NOTE(review): relies on Pool accepting data=None with data_can_be_none=True;
    # confirm against the Pool constructor of the installed catboost version.
    pool = Pool(None, data_can_be_none=True)
    pool._read(
        data_path,
        column_description,
        pairs,
        graph,
        feature_names,
        delimiter,
        has_header,
        ignore_csv_quoting,
        thread_count,
        params,
        log_cout=log_cout,
        log_cerr=log_cerr
    )

    return pool
def convert_to_onnx_object(model, export_parameters=None, **kwargs):
    """
    Convert given CatBoost model to ONNX-ML model.
    Categorical Features are not supported.

    Parameters
    ----------
    model : CatBoost trained model

    export_parameters : dict [default=None]
        Parameters for ONNX-ML export:
            * onnx_graph_name : string
                The name property of onnx Graph
            * onnx_domain : string
                The domain component of onnx Model
            * onnx_model_version : int
                The model_version component of onnx Model
            * onnx_doc_string : string
                The doc_string component of onnx Model

    **kwargs
        'target_opset' (other than None or 2) and 'initial_types' (other than None)
        are not supported and only produce a warning; other keywords are ignored.

    Returns
    -------
    onnx_object : ModelProto
        The model in ONNX format

    Raises
    ------
    ImportError
        If the onnx package cannot be imported.
    CatBoostError
        If the model is not fitted.
    """
    try:
        import onnx
    except ImportError as e:
        warnings.warn("To get working onnx model you should install onnx.")
        raise ImportError(str(e)) from e

    import json

    if not model.is_fitted():
        # Name this function in the message (the previous text referred to save_model()).
        raise CatBoostError(
            "There is no trained model to use convert_to_onnx_object(). Use fit() to train model. Then use this method.")

    for name, value in kwargs.items():
        if name == 'target_opset' and value not in [None, 2]:
            warnings.warn('target_opset argument is not supported. Default target_opset is 2 (ai.onnx.ml domain)')
        elif name == 'initial_types' and value is not None:
            warnings.warn('initial_types argument is not supported')

    # Export parameters are handed over as a JSON string; empty string means "none".
    params_string = ""
    if export_parameters:
        params_string = json.dumps(export_parameters, cls=_NumpyAwareEncoder)

    model_str = _get_onnx_model(model._object, params_string)
    onnx_model = onnx.load_model_from_string(model_str)
    return onnx_model
def calculate_quantization_grid(values, border_count, border_type='Median'):
    """
    Calculate a quantization grid for the given values.

    Parameters
    ----------
    values : sequence of numbers
        Values forwarded to ``_calculate_quantization_grid``.

    border_count : int
        Requested number of borders, must be > 0.

    border_type : string, [default='Median']
        Border selection mode forwarded to ``_calculate_quantization_grid``.

    Returns
    -------
    The result of ``_calculate_quantization_grid`` (presumably the computed borders).

    Raises
    ------
    AssertionError
        If border_count is not > 0.
    """
    # Explicit raise instead of an ``assert`` statement so the check is not stripped
    # when Python runs with -O; exception type and message are unchanged.
    if not (border_count > 0):
        raise AssertionError('Border count should be > 0')
    return _calculate_quantization_grid(values, border_count, border_type)