# catboost/datasets.py
import hashlib
import logging
import numpy as np
import os
import pandas as pd
import tarfile
import tempfile
import shutil
import six

from .core import PATH_TYPES, fspath

logger = logging.getLogger(__name__)


def _extract(src_file, dst_dir='.'):
    cur_dir = os.getcwd()
    os.chdir(dst_dir)
    try:
        with tarfile.open(src_file, 'r:gz') as f:
            f.extractall()
    finally:
        os.chdir(cur_dir)
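
# Note: `tarfile` can also extract directly into a target directory via
# `f.extractall(path=dst_dir)`, which would avoid the chdir dance above.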


def _calc_md5(path):
    hasher = hashlib.md5()
    with open(path, 'rb') as f:
        while True:
            block = f.read(65536)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
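
# Illustrative check (hypothetical file name): the chunked digest above matches
# hashlib's one-shot digest:
#
#   with open('example.bin', 'rb') as f:
#       assert _calc_md5('example.bin') == hashlib.md5(f.read()).hexdigest()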


def _ensure_dir_exists(path):
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
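
# Note: on Python 3 this helper behaves like `os.makedirs(path, exist_ok=True)`;
# the try/except form also works on Python 2, which lacks `exist_ok`.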


def _cached_download(url, md5, dst):
    if os.path.isfile(dst) and _calc_md5(dst) == md5:
        return

    def reporthook(blocknum, bs, size):
        logger.debug('downloaded %s bytes', size)

    urls = url if isinstance(url, (list, tuple)) else (url, )
    for u in urls:
        try:
            six.moves.urllib.request.urlretrieve(u, dst, reporthook=reporthook)
            break
        except (six.moves.urllib.error.URLError, IOError):
            logger.debug('failed to download from %s', u)
    else:
        raise RuntimeError('failed to download from %s' % (urls, ))
    dst_md5 = _calc_md5(dst)
    if dst_md5 != md5:
        raise RuntimeError('md5 sum mismatch for url {url}; expected {expected}, but got {got}'.format(
            url=u, expected=md5, got=dst_md5))
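
# Example sketch (hypothetical URLs and md5): try a mirror first, fall back to
# the origin, and skip the download if `dst` already has the expected md5:
#
#   _cached_download(
#       ('https://mirror.example.com/data.tar.gz',
#        'https://origin.example.com/data.tar.gz'),
#       '0123456789abcdef0123456789abcdef',
#       '/tmp/data.tar.gz')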


_cache_path = None


def _get_cache_path():
    global _cache_path
    if _cache_path is None:
        _cache_path = os.path.join(os.getcwd(), 'catboost_cached_datasets')
    return _cache_path


def set_cache_path(path):
    assert isinstance(path, PATH_TYPES), 'expected string or pathlib.Path'
    global _cache_path
    _cache_path = fspath(path)
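
# Example: redirect the dataset cache before loading any cached dataset
# (the directory below is illustrative):
#
#   from pathlib import Path
#   set_cache_path(Path.home() / '.catboost_datasets')
#   train, test = msrank()  # now cached under ~/.catboost_datasets/msrank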


def _download_dataset(url, md5, dataset_name, train_file, test_file, cache=False):
    # TODO(yazevnul): this is not thread safe (or process safe), we should take a file lock when
    # entering this function to avoid the dataset being overwritten, corrupted, or otherwise
    # damaged when the OS operates on the same file from multiple processes simultaneously.
    # The same should probably be done in `_cached_download`.
    dir_path = os.path.join(_get_cache_path(), dataset_name) if cache else tempfile.mkdtemp()
    train_path = os.path.join(dir_path, train_file)
    test_path = os.path.join(dir_path, test_file)
    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        _ensure_dir_exists(dir_path)
        file_descriptor, file_path = tempfile.mkstemp()
        os.close(file_descriptor)
        try:
            _cached_download(url, md5, file_path)
            _extract(file_path, dir_path)
        finally:
            os.remove(file_path)
    # move the files out so the temp dir can be deleted safely
    if not cache:
        fd_new_train, new_train_path = tempfile.mkstemp()
        fd_new_test, new_test_path = tempfile.mkstemp()
        os.close(fd_new_train)
        os.close(fd_new_test)
        os.replace(train_path, new_train_path)
        os.replace(test_path, new_test_path)
        shutil.rmtree(dir_path)
        train_path, test_path = new_train_path, new_test_path
    return train_path, test_path
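
# Usage note: with cache=True the extracted files live under
# <cache_path>/<dataset_name>/ and are reused on later calls; with cache=False
# they are moved to fresh temp files that the caller must delete (as
# `_load_dataset_pd` below does).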


def _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep=',', header='infer', cache=False):
    train_path, test_path = _download_dataset(url, md5, dataset_name, train_file, test_file, cache)
    train, test = pd.read_csv(train_path, header=header, sep=sep), pd.read_csv(test_path, header=header, sep=sep)
    if not cache:
        os.remove(train_path)
        os.remove(test_path)
    return train, test


def _load_numeric_only_dataset(path, row_count, column_count, sep='\t'):
    # - can't use `pandas.read_csv` because it may result in 5x overhead
    # - can't use `numpy.loadtxt` because it may result in 3x overhead
    # Both of the above are also very slow compared to the implementation below.
    dataset = np.zeros((row_count, column_count, ), dtype=np.float32, order='F')
    with open(path, 'rb') as f:
        for line_idx, line in enumerate(f):
            # `str.split()` is too slow, use `numpy.fromstring()`
            row = np.fromstring(line, dtype=np.float32, sep=sep)
            assert row.size == column_count, 'unexpected column count at line %d (expected %d columns, got %d)' % (line_idx + 1, column_count, row.size)
            # `dataset[line_idx][:]` instead of `dataset[line_idx]` is used on purpose:
            # it copies into the preallocated row rather than risking a reallocation
            dataset[line_idx][:] = row
    assert line_idx + 1 == row_count, 'unexpected line count (expected %d lines, got %d)' % (row_count, line_idx + 1)
    return pd.DataFrame(dataset)
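
# Illustrative parse of a single line, mirroring the loop above:
#
#   row = np.fromstring(b'1.0\t2.5\t-3.0\n', dtype=np.float32, sep='\t')
#   # -> array([ 1. ,  2.5, -3. ], dtype=float32)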


def titanic():
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/233854/titanic.tar.gz'
    md5 = '9c8bc61d545c6af244a1d37494df3fc3'
    dataset_name, train_file, test_file = 'titanic', 'train.csv', 'test.csv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file)
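
# Example: every public dataset helper below follows the same calling pattern:
#
#   train_df, test_df = titanic()
#   train_df.head()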


def amazon():
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/amazon.tar.gz'
    md5 = '8fe3eec12bfd9c4c532b24a181d0aa2c'
    dataset_name, train_file, test_file = 'amazon', 'train.csv', 'test.csv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file)


def msrank():
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/233854/msrank.tar.gz'
    md5 = '34fee225d02419adc106581f4eb36f2e'
    dataset_name, train_file, test_file = 'msrank', 'train.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, header=None, sep='\t', cache=True)


def msrank_10k():
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/msrank_10k.tar.gz'
    md5 = '79c5b67397289c4c8b367c1f34629eae'
    dataset_name, train_file, test_file = 'msrank_10k', 'train.csv', 'test.csv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, header=None)


def rotten_tomatoes():
    """
    Contains information from Kaggle [1], which is made available here under the
    Open Database License (ODbL) [2].

    Download the "rotten_tomatoes" [1] dataset.

    Returns two pandas.DataFrame-s: the train part (rotten_tomatoes.data) and the test part
    (rotten_tomatoes.test) of the dataset.

    NOTE: This is a preprocessed version of the dataset.

    [1]: https://www.kaggle.com/rpnuser8182/rotten-tomatoes
    [2]: https://opendatacommons.org/licenses/odbl/1-0/index.html
    """
    url = 'https://catboost-opensource.s3.yandex.net/rotten_tomatoes.tar.gz'
    md5 = 'a07fed612805ac9e17ced0d82a96add4'
    dataset_name, train_file, test_file = 'rotten_tomatoes', 'learn.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t')


def imdb():
    url = 'https://catboost-opensource.s3.yandex.net/imdb.tar.gz'
    md5 = '0fd62578d631ac3d71a71c3e6ced6f8b'
    dataset_name, train_file, test_file = 'imdb', 'learn.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t')


def epsilon():
    """
    Download the "epsilon" [1] dataset.

    Returns two pandas.DataFrame-s: the train part (epsilon_normalized) and the test part
    (epsilon_normalized.t) of the dataset. The object class is located in the first
    column of the dataset.

    NOTE: This is a preprocessed version of the dataset. It was converted from libsvm format
    into tsv (CatBoost doesn't support the libsvm format out of the box).

    [1]: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#epsilon
    """
    urls = (
        'https://proxy.sandbox.yandex-team.ru/785711439',
        'https://storage.mds.yandex.net/get-devtools-opensource/250854/epsilon.tar.gz', )
    md5 = '5bbfac403ac673da7d7ee84bd532e973'
    dataset_name, train_file, test_file = 'epsilon', 'train.tsv', 'test.tsv'
    train_path, test_path = _download_dataset(urls, md5, dataset_name, train_file, test_file, cache=True)
    return (
        _load_numeric_only_dataset(train_path, 400000, 2001, sep='\t'),
        _load_numeric_only_dataset(test_path, 100000, 2001, sep='\t'))
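
# Usage note: given the row/column counts above, epsilon() returns a
# 400000 x 2001 train frame and a 100000 x 2001 test frame, with the class
# label in column 0.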


def monotonic1():
    """
    Dataset with monotonic constraints.
    Can be used for Poisson regression.

    Has several numerical and several categorical features.
    The first column contains target values. Columns with names Cat* contain categorical
    features; columns with names Num* contain numerical features.

    The dataset also contains several numerical features for which monotonic constraints must
    hold. For features in columns named MonotonicNeg*, if the feature value decreases, then the
    prediction value must not decrease. Thus, if there are two samples x1, x2 with all features
    equal except for a monotonic negative feature M, such that x1[M] > x2[M], then the following
    inequality must hold for the predictions: f(x1) <= f(x2)
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/479623/monotonic1.tar.gz'
    md5 = '1b9d8e15bc3fd6f1498e652e7fc4f4ca'
    dataset_name, train_file, test_file = 'monotonic1', 'train.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t', cache=True)


def monotonic2():
    """
    Dataset with monotonic constraints.
    Can be used for regression.

    The first column contains target values. The other columns contain numerical features for
    which monotonic constraints must hold.

    For features in columns named MonotonicNeg*, if the feature value decreases, then the
    prediction value must not decrease. Thus, if there are two samples x1, x2 with all features
    equal except for a monotonic negative feature MNeg, such that x1[MNeg] > x2[MNeg], then
    the following inequality must hold for the predictions: f(x1) <= f(x2)

    For features in columns named MonotonicPos*, if the feature value decreases, then the
    prediction value must not increase. Thus, if there are two samples x1, x2 with all features
    equal except for a monotonic positive feature MPos, such that x1[MPos] > x2[MPos],
    then the following inequality must hold for the predictions: f(x1) >= f(x2)
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/monotonic2.tar.gz'
    md5 = 'ce559e212cb72c156269f6f9a641baca'
    dataset_name, train_file, test_file = 'monotonic2', 'train.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t')
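
# Hedged sketch (assumes a CatBoost version that accepts the
# `monotone_constraints` training parameter): enforce the constraints the
# docstring describes, +1 for MonotonicPos* columns and -1 for MonotonicNeg*:
#
#   from catboost import CatBoostRegressor
#   train, test = monotonic2()
#   X, y = train.iloc[:, 1:], train.iloc[:, 0]
#   constraints = [1 if c.startswith('MonotonicPos') else -1 for c in X.columns]
#   model = CatBoostRegressor(monotone_constraints=constraints, verbose=False)
#   model.fit(X, y)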


def adult():
    """
    Download the "Adult Data Set" [1] from the UCI Machine Learning Repository.

    Returns two pandas.DataFrame-s: the train part (adult.data) and the test part
    (adult.test) of the dataset.

    [1]: https://archive.ics.uci.edu/ml/datasets/Adult
    """
    # via https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
    names = (
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
        'native-country', 'income', )
    dtype = {
        'age': float, 'workclass': object, 'fnlwgt': float, 'education': object,
        'education-num': float, 'marital-status': object, 'occupation': object,
        'relationship': object, 'race': object, 'sex': object, 'capital-gain': float,
        'capital-loss': float, 'hours-per-week': float,
        'native-country': object, 'income': object, }
    # proxy.sandbox.yandex-team.ru is Yandex internal storage; we first try to download from
    # internal storage to avoid putting too much pressure on UCI storage from our internal CI
    train_urls = (
        'https://proxy.sandbox.yandex-team.ru/779118052',
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', )
    train_md5 = '5d7c39d7b8804f071cdd1f2a7c460872'
    fd_train, train_path = tempfile.mkstemp()
    os.close(fd_train)
    _cached_download(train_urls, train_md5, train_path)
    test_urls = (
        'https://proxy.sandbox.yandex-team.ru/779120000',
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', )
    test_md5 = '35238206dfdf7f1fe215bbb874adecdc'
    fd_test, test_path = tempfile.mkstemp()
    os.close(fd_test)
    _cached_download(test_urls, test_md5, test_path)
    train_df = pd.read_csv(train_path, names=names, header=None, sep=r',\s*', na_values=['?'], engine='python')
    os.remove(train_path)
    # lines in the test part end with a dot, so we need to fix the last column of the dataset
    test_df = pd.read_csv(test_path, names=names, header=None, sep=r',\s*', na_values=['?'], skiprows=1, converters={'income': lambda x: x[:-1]}, engine='python')
    os.remove(test_path)
    # pandas 0.19.1 doesn't support the `dtype` parameter for `read_csv` when the `python` engine
    # is used, so we have to do the casting manually; also, `converters` can't be used together
    # with `dtype`
    train_df = train_df.astype(dtype)
    test_df = test_df.astype(dtype)
    return train_df, test_df
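
# Example (column names as above):
#
#   train_df, test_df = adult()
#   train_df['income'].unique()  # -> '<=50K' / '>50K'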


def higgs():
    """
    Download the "higgs" [1] dataset.

    Returns two pandas.DataFrame-s: the train part and the test part of the dataset.
    The object class is located in the first column of the dataset.

    [1]: https://archive.ics.uci.edu/ml/datasets/HIGGS
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/higgs.tar.gz'
    md5 = 'ad59ba8328a9afa3837d7bf1a0e10e7b'
    dataset_name, train_file, test_file = 'higgs', 'train.tsv', 'test.tsv'
    train_path, test_path = _download_dataset(url, md5, dataset_name, train_file, test_file, cache=True)
    return (
        _load_numeric_only_dataset(train_path, 10500000, 29, sep='\t'),
        _load_numeric_only_dataset(test_path, 500000, 29, sep='\t'))
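
# Usage note: given the row/column counts above, higgs() returns a
# 10500000 x 29 train frame and a 500000 x 29 test frame, with the class label
# in column 0.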