from . import _catboost
from .core import Pool, CatBoostError, ARRAY_TYPES, PATH_TYPES, fspath, _update_params_quantize_part, _process_synonyms
from collections import defaultdict
from contextlib import contextmanager
import sys
import numpy as np
import warnings

# Module-level aliases for the native helpers exposed by the _catboost extension.
_eval_metric_util = _catboost._eval_metric_util
_get_roc_curve = _catboost._get_roc_curve
_get_confusion_matrix = _catboost._get_confusion_matrix
_select_threshold = _catboost._select_threshold
_NumpyAwareEncoder = _catboost._NumpyAwareEncoder
_get_onnx_model = _catboost._get_onnx_model
_calculate_quantization_grid = _catboost._calculate_quantization_grid
compute_wx_test = _catboost.compute_wx_test
TargetStats = _catboost.TargetStats
DataMetaInfo = _catboost.DataMetaInfo
compute_training_options = _catboost.compute_training_options


@contextmanager
def _import_matplotlib():
    """
    Context manager that yields ``matplotlib.pyplot``.

    Emits a warning and re-raises ImportError (with the original message)
    when matplotlib cannot be imported.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError as e:
        warnings.warn("To draw plots you should install matplotlib.")
        raise ImportError(str(e))
    yield plt


def _draw(plt, x, y, x_label, y_label, title):
    """Plot y against x on a new 16x8 figure with the given labels and title, then show it."""
    plt.figure(figsize=(16, 8))
    plt.plot(x, y, alpha=0.5, lw=2)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.grid(True)
    plt.xlabel(x_label, fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.title(title, fontsize=20)
    plt.show()


def create_cd(
    label=None,
    cat_features=None,
    text_features=None,
    embedding_features=None,
    weight=None,
    baseline=None,
    doc_id=None,
    group_id=None,
    subgroup_id=None,
    timestamp=None,
    auxiliary_columns=None,
    feature_names=None,
    output_path='train.cd'
):
    """
    Write a column description file.

    Every line of the output has the form ``<index>\\t<type>\\t<name>`` and
    lines are written in increasing order of column index.

    Parameters
    ----------
    label, weight, baseline, doc_id, group_id, subgroup_id, timestamp : int, [default=None]
        Index of the column with the corresponding type.

    cat_features, text_features, embedding_features, auxiliary_columns : int or iterable of int, [default=None]
        Indices of columns of the corresponding type
        ('Categ', 'Text', 'NumVector', 'Auxiliary').

    feature_names : dict, [default=None]
        Mapping of column index -> feature name. Every index must refer to a
        feature column ('Num', 'Categ', 'Text' or 'NumVector'); indices that
        are not mentioned elsewhere are treated as 'Num'.

    output_path : string or pathlib.Path, [default='train.cd']
        Where to write the column description.

    Raises
    ------
    CatBoostError
        If an index is not an int, if an index occurs more than once, or if
        feature_names refers to a non-feature column.
    """
    # (value, column type, accepts several indices), listed in parameter
    # declaration order so that validation happens in a fixed, explicit order
    # instead of depending on introspection of locals().
    column_params = (
        (label, 'Label', False),
        (cat_features, 'Categ', True),
        (text_features, 'Text', True),
        (embedding_features, 'NumVector', True),
        (weight, 'Weight', False),
        (baseline, 'Baseline', False),
        (doc_id, 'DocId', False),
        (group_id, 'GroupId', False),
        (subgroup_id, 'SubgroupId', False),
        (timestamp, 'Timestamp', False),
        (auxiliary_columns, 'Auxiliary', True),
    )

    # Columns that are never mentioned explicitly default to unnamed 'Num'.
    column_description = defaultdict(lambda: ['Num', ''])

    for value, column_type, is_multi in column_params:
        if value is None:
            continue
        if is_multi:
            indices = [value] if isinstance(value, int) else value
        else:
            indices = [value]
        for index in indices:
            if not isinstance(index, int):
                raise CatBoostError('Unsupported index type. Expected int, got {}'.format(type(index)))
            if index in column_description:
                raise CatBoostError('The index {} occurs more than once'.format(index))
            column_description[index] = [column_type, '']

    if feature_names is not None:
        for feature_column_index, name in feature_names.items():
            # Indexing the defaultdict registers unseen indices as 'Num' columns.
            if column_description[feature_column_index][0] not in ('Num', 'Categ', 'Text', 'NumVector'):
                raise CatBoostError('feature_names contains index {} that does not correspond to feature column'.format(feature_column_index))
            column_description[feature_column_index][1] = name

    with open(fspath(output_path), 'w') as f:
        for index, (title, name) in sorted(column_description.items()):
            f.write('{}\t{}\t{}\n'.format(index, title, name))


def read_cd(cd_file, column_count=None, data_file=None, canonize_column_types=False):
    """
    Reads CatBoost column description file
    (see https://catboost.ai/docs/concepts/input-data_column-descfile.html#input-data_column-descfile)

    Parameters
    ----------
    cd_file : str or pathlib.Path
        path to column description file

    column_count : integer
        total number of columns

    data_file : str or pathlib.Path
        path to dataset file in CatBoost format
        specify either column_count directly or data_file to detect it

    canonize_column_types : bool
        if set to True types for columns with synonyms are renamed to canonical type.

    Returns
    -------
    dict with keys:
        "column_type_to_indices" :
            dict of column_type -> column_indices list, column_type is 'Label', 'Categ' etc.

        "column_dtypes" : dict of column_name -> numpy.dtype or 'category'

        "cat_feature_indices" : list of integers
            indices of categorical features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "text_feature_indices" : list of integers
            indices of text features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "embedding_feature_indices" : list of integers
            indices of embedding features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "column_names" : list of strings

        "non_feature_column_indices" : list of integers

    Raises
    ------
    Exception
        If the column count cannot be determined, if a cd file line does not
        have 2 or 3 tab-separated fields, or if a column index is duplicated.
    """
    column_type_synonyms_map = {
        'Target': 'Label',
        'DocId': 'SampleId',
        'QueryId': 'GroupId'
    }

    if column_count is None:
        if data_file is None:
            raise Exception(
                'Cannot obtain column count: either specify column_count parameter or specify data_file '
                + 'parameter to get it'
            )
        with open(fspath(data_file)) as f:
            # Strip only the line terminator: the first line may lack a trailing
            # newline, and blindly dropping the last character would then eat
            # real data (e.g. a trailing tab that delimits an empty last column).
            column_count = len(f.readline().rstrip('\n').split('\t'))

    column_type_to_indices = {}
    column_dtypes = {}
    cat_feature_indices = []
    text_feature_indices = []
    embedding_feature_indices = []
    column_names = []
    non_feature_column_indices = []

    # list of (column_idx, column_type, column_name) tuples, needed to support CD files with nonincreasing
    # column indices
    column_descriptions = []

    with open(fspath(cd_file)) as f:
        for line in f:
            line = line.strip()
            # some cd files in the wild contain empty lines
            if not line:
                continue
            line_columns = line.split('\t')
            if len(line_columns) not in [2, 3]:
                raise Exception('Wrong number of columns in cd file')

            column_idx = int(line_columns[0])
            column_type = line_columns[1]
            column_name = line_columns[2] if len(line_columns) == 3 else None

            column_descriptions.append((column_idx, column_type, column_name))

    column_descriptions.sort()

    def add_missed_columns(start_column_idx, end_column_idx, non_feature_column_count):
        # Columns absent from the cd file are float ('Num') features with generated names.
        for missed_column_idx in range(start_column_idx, end_column_idx):
            column_name = 'feature_%i' % (missed_column_idx - non_feature_column_count)
            column_names.append(column_name)
            column_type_to_indices.setdefault('Num', []).append(missed_column_idx)
            column_dtypes[column_name] = np.float32

    last_column_idx = -1
    for column_idx, column_type, column_name in column_descriptions:
        # Descriptions are sorted, so duplicated indices are always adjacent.
        if column_idx == last_column_idx:
            raise Exception('Duplicate column indices in cd file')

        add_missed_columns(last_column_idx + 1, column_idx, len(non_feature_column_indices))

        if canonize_column_types:
            column_type = column_type_synonyms_map.get(column_type, column_type)

        column_type_to_indices.setdefault(column_type, []).append(column_idx)

        if column_type in ['Num', 'Categ', 'Text', 'NumVector']:
            # Feature index = column index minus the non-feature columns seen so far.
            feature_idx = column_idx - len(non_feature_column_indices)
            if column_name is None:
                column_name = 'feature_%i' % feature_idx
            if column_type == 'Categ':
                cat_feature_indices.append(feature_idx)
                column_dtypes[column_name] = 'category'
            elif column_type == 'Text':
                text_feature_indices.append(feature_idx)
                column_dtypes[column_name] = object
            elif column_type == 'NumVector':
                embedding_feature_indices.append(feature_idx)
                column_dtypes[column_name] = object
            else:
                column_dtypes[column_name] = np.float32
        else:
            non_feature_column_indices.append(column_idx)
            if column_name is None:
                column_name = column_type

        column_names.append(column_name)

        last_column_idx = column_idx

    add_missed_columns(last_column_idx + 1, column_count, len(non_feature_column_indices))

    return {
        'column_type_to_indices': column_type_to_indices,
        'column_dtypes': column_dtypes,
        'cat_feature_indices': cat_feature_indices,
        'text_feature_indices': text_feature_indices,
        'embedding_feature_indices': embedding_feature_indices,
        'column_names': column_names,
        'non_feature_column_indices': non_feature_column_indices
    }


def eval_metric(label, approx, metric, weight=None, group_id=None, group_weight=None, subgroup_id=None, pairs=None, thread_count=-1):
    """
    Evaluate metrics with raw approxes and labels.

    Parameters
    ----------
    label : list or numpy.ndarrays or pandas.DataFrame or pandas.Series
        Object labels with shape (n_objects,) or (n_object, n_target_dimension)

    approx : list or numpy.ndarrays or pandas.DataFrame or pandas.Series
        Object approxes with shape (n_objects,) or (n_object, n_approx_dimension).

    metric : string
        Metric name.

    weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Object weights.

    group_id : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Object group ids.

    group_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Group weights.

    subgroup_id : list or numpy.ndarray, optional (default=None)
        subgroup id for each instance.
        If not None, giving 1 dimensional array like data.

    pairs : list or numpy.ndarray or pandas.DataFrame or string or pathlib.Path
        The pairs description.
        If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.
        The shape should be Nx2, where N is the pairs' count. The first element of the pair is
        the index of winner object in the training set. The second element of the pair is
        the index of loser object in the training set.
        If string or pathlib.Path, giving the path to the file with pairs description.

    thread_count : int, optional (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    metric results : list with metric values.
    """
    # One-dimensional input is wrapped into a single row; multi-dimensional
    # input is transposed before being handed to the native helper.
    if len(label) > 0:
        label = np.transpose(label) if isinstance(label[0], ARRAY_TYPES) else [label]
    if len(approx) == 0:
        approx = [[]]
    approx = np.transpose(approx) if isinstance(approx[0], ARRAY_TYPES) else [approx]
    return _eval_metric_util(label, approx, metric, weight, group_id, group_weight, subgroup_id, pairs, thread_count)


def get_gpu_device_count():
    """Return the value reported by ``_catboost._get_gpu_device_count()``."""
    return _catboost._get_gpu_device_count()


def reset_trace_backend(filename):
    """Forward ``filename`` to ``_catboost._reset_trace_backend``."""
    _catboost._reset_trace_backend(filename)


def _as_pool_list(data):
    """
    Normalize ``data`` to a list of catboost.Pool.

    A single Pool is wrapped into a one-element list; anything other than a
    Pool or a list of Pools raises CatBoostError.
    """
    if isinstance(data, Pool):
        data = [data]
    if not isinstance(data, list):
        raise CatBoostError('data must be a catboost.Pool or list of pools.')
    for pool in data:
        if not isinstance(pool, Pool):
            raise CatBoostError('one of data pools is not catboost.Pool')
    return data


def _check_roc_curve_triple(curve):
    """Raise CatBoostError unless ``curve`` is a list or tuple of exactly three items."""
    if not isinstance(curve, (list, tuple)) or len(curve) != 3:
        raise CatBoostError('curve must be list or tuple of three arrays (fpr, tpr, thresholds).')


def get_confusion_matrix(model, data, thread_count=-1):
    """
    Build confusion matrix.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool
        A set of samples to build confusion matrix with.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    confusion matrix : array, shape = [n_classes, n_classes]
    """
    if not isinstance(data, Pool):
        raise CatBoostError('data must be a catboost.Pool')

    return _get_confusion_matrix(model._object, data, thread_count)


def get_roc_curve(model, data, thread_count=-1, plot=False):
    """
    Build points of ROC curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of three arrays (fpr, tpr, thresholds)
    """
    data = _as_pool_list(data)

    roc_curve = _get_roc_curve(model._object, data, thread_count)

    if plot:
        with _import_matplotlib() as plt:
            _draw(plt, roc_curve[0], roc_curve[1], 'False Positive Rate', 'True Positive Rate', 'ROC Curve')

    return roc_curve


def get_fpr_curve(model=None, data=None, curve=None, thread_count=-1, plot=False):
    """
    Build points of FPR curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of two arrays (thresholds, fpr)
    """
    if curve is not None:
        if data is not None:
            raise CatBoostError('Only one of the parameters data and curve should be set.')
        _check_roc_curve_triple(curve)
        # Slices, so the returned sequences are not the caller's own objects
        # when the inputs are plain lists.
        fpr, thresholds = curve[0][:], curve[2][:]
    else:
        if model is None or data is None:
            raise CatBoostError('model and data parameters should be set when curve parameter is None.')
        fpr, _, thresholds = get_roc_curve(model, data, thread_count)

    if plot:
        with _import_matplotlib() as plt:
            _draw(plt, thresholds, fpr, 'Thresholds', 'False Positive Rate', 'FPR Curve')

    return thresholds, fpr


def get_fnr_curve(model=None, data=None, curve=None, thread_count=-1, plot=False):
    """
    Build points of FNR curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of two arrays (thresholds, fnr)
    """
    if curve is not None:
        if data is not None:
            raise CatBoostError('Only one of the parameters data and curve should be set.')
        _check_roc_curve_triple(curve)
        tpr, thresholds = curve[1], curve[2][:]
    else:
        if model is None or data is None:
            raise CatBoostError('model and data parameters should be set when curve parameter is None.')
        _, tpr, thresholds = get_roc_curve(model, data, thread_count)

    # False negative rate is the complement of the true positive rate.
    fnr = np.array([1 - x for x in tpr])

    if plot:
        with _import_matplotlib() as plt:
            _draw(plt, thresholds, fnr, 'Thresholds', 'False Negative Rate', 'FNR Curve')

    return thresholds, fnr


def select_threshold(model=None, data=None, curve=None, FPR=None, FNR=None, thread_count=-1):
    """
    Selects a threshold for prediction.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        Set of samples to build ROC curve with.
        If set, curve parameter must not be set.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    FPR : desired false-positive rate

    FNR : desired false-negative rate (only one of FPR and FNR should be chosen)

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    threshold : double
    """
    if data is not None:
        if curve is not None:
            raise CatBoostError('Only one of the parameters data and curve should be set.')
        if model is None:
            raise CatBoostError('model and data parameters should be set when curve parameter is None.')
        data = _as_pool_list(data)
        return _select_threshold(model._object, data, None, FPR, FNR, thread_count)
    elif curve is not None:
        _check_roc_curve_triple(curve)
        return _select_threshold(None, None, curve, FPR, FNR, thread_count)
    else:
        raise CatBoostError('One of the parameters data and curve should be set.')


def quantize(
    data_path,
    column_description=None,
    pairs=None,
    graph=None,
    delimiter='\t',
    has_header=False,
    ignore_csv_quoting=False,
    feature_names=None,
    thread_count=-1,
    ignored_features=None,
    per_float_feature_quantization=None,
    border_count=None,
    max_bin=None,
    feature_border_type=None,
    nan_mode=None,
    input_borders=None,
    task_type=None,
    used_ram_limit=None,
    random_seed=None,
    log_cout=sys.stdout,
    log_cerr=sys.stderr,
    **kwargs
):
    """
    Construct quantized Pool from non-quantized pool stored in file.
    This method does not load whole non-quantized source dataset into memory
    so it can be used for huge datasets that fit in memory only after quantization.

    Parameters
    ----------
    data_path : string or pathlib.Path
        Path (with optional scheme) to non-quantized dataset.

    column_description : string, [default=None]
        ColumnsDescription parameter.
        There are several columns description types: Label, Categ, Num, Auxiliary, DocId, Weight, Baseline, GroupId, Timestamp.
        All columns are Num as default, it's not necessary to specify
        this type of columns. Default Label column index is 0 (zero).
        If None, Label column is 0 (zero) as default, all data columns are Num as default.
        If string or pathlib.Path, giving the path to the file with ColumnsDescription in column_description format.

    pairs : string or pathlib.Path, [default=None]
        Path to the file with pairs description.

    graph : string or pathlib.Path, [default=None]
        Path to the file with graph description.

    delimiter : string, [default is the tab character]
        Passed through to the Pool as its ``delimiter`` argument.

    has_header : bool, [default=False]
        If True, read column names from first line.

    ignore_csv_quoting : bool optional (default=False)
        If True ignore quoting '"'.

    feature_names : string or pathlib.Path, [default=None]
        Path with scheme for feature names data to load.

    thread_count : int, [default=-1]
        Thread count for data processing.
        If -1, then the number of threads is set to the number of CPU cores.

    ignored_features : list, [default=None]
        Indices or names of features that should be excluded when training.

    per_float_feature_quantization : list of strings, [default=None]
        List of float binarization descriptions.
        Format : described in documentation on catboost.ai
        Example 1: ['0:1024'] means that feature 0 will have 1024 borders.
        Example 2: ['0:border_count=1024', '1:border_count=1024', ...] means that two first features have 1024 borders.
        Example 3: ['0:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum',
                    '1:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum'] - defines more quantization properties for first two features.

    border_count : int, [default = 254 for training on CPU or 128 for training on GPU]
        The number of partitions in numeric features binarization. Used in the preliminary calculation.
        range: [1,65535] on CPU, [1,255] on GPU

    max_bin : float, synonym for border_count.

    feature_border_type : string, [default='GreedyLogSum']
        The binarization mode in numeric features binarization. Used in the preliminary calculation.
        Possible values:
            - 'Median'
            - 'Uniform'
            - 'UniformAndQuantiles'
            - 'GreedyLogSum'
            - 'MaxLogSum'
            - 'MinEntropy'

    nan_mode : string, [default=None]
        Way to process missing values for numeric features.
        Possible values:
            - 'Forbidden' - raises an exception if there is a missing value for a numeric feature in a dataset.
            - 'Min' - each missing value will be processed as the minimum numerical value.
            - 'Max' - each missing value will be processed as the maximum numerical value.
        If None, then nan_mode=Min.

    input_borders : string or pathlib.Path, [default=None]
        input file with borders used in numeric features binarization.

    task_type : string, [default=None]
        The calcer type used to train the model.
        Possible values:
            - 'CPU'
            - 'GPU'

    used_ram_limit : [default=None]
        Forwarded unchanged to the quantization parameters.

    random_seed : int, [default=None]
        The random seed used for data sampling.
        If None, 0 is used.

    log_cout, log_cerr : [default=sys.stdout / sys.stderr]
        Forwarded to the Pool constructor and to the Pool read call.

    **kwargs
        Only 'dev_block_size' and 'dev_max_subset_size_for_build_borders' are
        accepted; any other keyword raises CatBoostError.

    Returns
    -------
    pool : Pool
        Constructed and quantized pool.
    """
    if not data_path:
        raise CatBoostError("Data filename is empty.")
    if not isinstance(data_path, PATH_TYPES):
        raise CatBoostError("Data filename should be string or pathlib.Path type.")

    if pairs is not None and not isinstance(pairs, PATH_TYPES):
        raise CatBoostError("pairs should have None or string or pathlib.Path type when the pool is read from the file.")
    if column_description is not None and not isinstance(column_description, PATH_TYPES):
        raise CatBoostError("column_description should have None or string or pathlib.Path type when the pool is read from the file.")
    if feature_names is not None and not isinstance(feature_names, PATH_TYPES):
        raise CatBoostError("feature_names should have None or string or pathlib.Path type when the pool is read from the file.")

    params = {}
    # NOTE(review): params is still empty at this point, so this call presumably
    # has nothing to canonize; kept to preserve the existing call sequence.
    _process_synonyms(params)

    # max_bin is only consulted when border_count itself was not given.
    if border_count is None:
        border_count = max_bin

    if 'dev_block_size' in kwargs:
        params['dev_block_size'] = kwargs.pop('dev_block_size')

    dev_max_subset_size_for_build_borders = kwargs.pop('dev_max_subset_size_for_build_borders', None)

    if kwargs:
        raise CatBoostError("got an unexpected keyword arguments: {}".format(kwargs.keys()))

    _update_params_quantize_part(
        params,
        ignored_features,
        per_float_feature_quantization,
        border_count,
        feature_border_type,
        None,  # sparse_features_conflict_fraction
        None,  # dev_efb_max_buckets
        nan_mode,
        input_borders,
        task_type,
        used_ram_limit,
        random_seed,
        dev_max_subset_size_for_build_borders
    )

    # NOTE(review): the Pool is constructed from data_path before _read() is
    # called with the quantization params. If the constructor itself reads the
    # dataset, the source data is loaded non-quantized first, which would
    # contradict the memory guarantee stated in the docstring - confirm against
    # Pool.__init__.
    pool = Pool(
        data_path,
        column_description=column_description,
        pairs=pairs,
        graph=graph,
        feature_names=feature_names,
        delimiter=delimiter,
        has_header=has_header,
        ignore_csv_quoting=ignore_csv_quoting,
        thread_count=thread_count,
        log_cout=log_cout,
        log_cerr=log_cerr
    )
    pool._read(
        data_path,
        column_description,
        pairs,
        graph,
        feature_names,
        delimiter,
        has_header,
        ignore_csv_quoting,
        thread_count,
        params,
        log_cout=log_cout,
        log_cerr=log_cerr
    )

    return pool


def convert_to_onnx_object(model, export_parameters=None, **kwargs):
    """
    Convert given CatBoost model to ONNX-ML model.
    Categorical Features are not supported.

    Parameters
    ----------
    model : CatBoost trained model

    export_parameters : dict [default=None]
        Parameters for ONNX-ML export:
            * onnx_graph_name : string
                The name property of onnx Graph
            * onnx_domain : string
                The domain component of onnx Model
            * onnx_model_version : int
                The model_version component of onnx Model
            * onnx_doc_string : string
                The doc_string component of onnx Model

    **kwargs
        'target_opset' (anything other than None or 2) and 'initial_types'
        (anything other than None) only produce a warning; other keywords
        are ignored.

    Returns
    -------
    onnx_object : ModelProto
        The model in ONNX format

    Raises
    ------
    ImportError
        If the onnx package cannot be imported.
    CatBoostError
        If the model is not fitted.
    """
    try:
        import onnx
    except ImportError as e:
        warnings.warn("To get working onnx model you should install onnx.")
        raise ImportError(str(e))

    import json

    if not model.is_fitted():
        raise CatBoostError(
            "There is no trained model to use save_model(). Use fit() to train model. Then use this method.")

    for name, value in kwargs.items():
        if name == 'target_opset' and value not in [None, 2]:
            warnings.warn('target_opset argument is not supported. Default target_opset is 2 (ai.onnx.ml domain)')
        elif name == 'initial_types' and value is not None:
            warnings.warn('initial_types argument is not supported')

    # Export parameters travel to the native exporter as a JSON string.
    params_string = ""
    if export_parameters:
        params_string = json.dumps(export_parameters, cls=_NumpyAwareEncoder)

    model_str = _get_onnx_model(model._object, params_string)
    onnx_model = onnx.load_model_from_string(model_str)
    return onnx_model


def calculate_quantization_grid(values, border_count, border_type='Median'):
    """
    Forward to the native ``_calculate_quantization_grid`` helper.

    ``border_count`` must be positive; this is checked with an assert.
    """
    assert border_count > 0, 'Border count should be > 0'
    return _calculate_quantization_grid(values, border_count, border_type)