import hashlib
import logging
import numpy as np
import os
import pandas as pd
import tarfile
import tempfile
import six
import shutil

from .core import PATH_TYPES, fspath


logger = logging.getLogger(__name__)


def _extract(src_file, dst_dir='.'):
    cur_dir = os.getcwd()
    os.chdir(dst_dir)
    try:
        with tarfile.open(src_file, 'r:gz') as f:
            f.extractall()
    finally:
        os.chdir(cur_dir)


def _calc_md5(path):
    hasher = hashlib.md5()
    with open(path, 'rb') as f:
        while True:
            block = f.read(65536)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()


def _ensure_dir_exists(path):
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def _cached_download(url, md5, dst):
    if os.path.isfile(dst) and _calc_md5(dst) == md5:
        return

    def reporthook(blocknum, bs, size):
        # `size` is the total download size (or -1 if unknown); report progress so far
        logger.debug('downloaded %d of %d bytes', blocknum * bs, size)

    urls = url if isinstance(url, (list, tuple)) else (url, )

    for u in urls:
        try:
            six.moves.urllib.request.urlretrieve(u, dst, reporthook=reporthook)
            break
        except (six.moves.urllib.error.URLError, IOError):
            logger.debug('failed to download from %s', u)
    else:
        raise RuntimeError('failed to download from any of {}'.format(urls))

    dst_md5 = _calc_md5(dst)
    if dst_md5 != md5:
        raise RuntimeError('md5 sum mismatch for url {url}; expected {expected}, but got {got}'.format(
            url=u, expected=md5, got=dst_md5))
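

# A minimal usage sketch (hypothetical URLs and checksum): `_cached_download` accepts
# either a single URL or a sequence of mirror URLs that are tried in order until one
# succeeds; the download is skipped entirely if `dst` already has the expected md5.
#
#     _cached_download(
#         ('https://mirror.example.com/data.tar.gz',
#          'https://origin.example.com/data.tar.gz'),
#         md5='0123456789abcdef0123456789abcdef',
#         dst='data.tar.gz')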


_cache_path = None


def _get_cache_path():
    global _cache_path
    if _cache_path is None:
        _cache_path = os.path.join(os.getcwd(), 'catboost_cached_datasets')
    return _cache_path


def set_cache_path(path):
    assert isinstance(path, PATH_TYPES), 'expected string or pathlib.Path'
    global _cache_path
    _cache_path = fspath(path)
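

# A minimal usage sketch (assuming this module is importable as `catboost.datasets`):
# redirect the on-disk cache before the first cached download, e.g. to a shared
# scratch directory, then load a cached dataset as usual.
#
#     from catboost import datasets
#     datasets.set_cache_path('/tmp/catboost_datasets')  # hypothetical path
#     msrank_train, msrank_test = datasets.msrank()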


def _download_dataset(url, md5, dataset_name, train_file, test_file, cache=False):
    # TODO(yazevnul): this is not thread-safe (or process-safe), we should take a file lock when
    # entering this function to avoid the dataset being overwritten, corrupted, or otherwise
    # damaged when the OS operates simultaneously on the same file. The same should probably be
    # done for `_cached_download`.
    dir_path = os.path.join(_get_cache_path(), dataset_name) if cache else tempfile.mkdtemp()
    train_path = os.path.join(dir_path, train_file)
    test_path = os.path.join(dir_path, test_file)
    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        _ensure_dir_exists(dir_path)
        file_descriptor, file_path = tempfile.mkstemp()
        os.close(file_descriptor)
        try:
            _cached_download(url, md5, file_path)
            _extract(file_path, dir_path)
        finally:
            os.remove(file_path)
    # move the files out so that the temporary directory can be deleted safely
    if not cache:
        fd_new_train, new_train_path = tempfile.mkstemp()
        fd_new_test, new_test_path = tempfile.mkstemp()
        os.close(fd_new_train)
        os.close(fd_new_test)
        os.replace(train_path, new_train_path)
        os.replace(test_path, new_test_path)
        shutil.rmtree(dir_path)
        train_path, test_path = new_train_path, new_test_path
    return train_path, test_path


def _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep=',', header='infer', cache=False):
    train_path, test_path = _download_dataset(url, md5, dataset_name, train_file, test_file, cache)
    train, test = pd.read_csv(train_path, header=header, sep=sep), pd.read_csv(test_path, header=header, sep=sep)
    if not cache:
        os.remove(train_path)
        os.remove(test_path)
    return train, test


def _load_numeric_only_dataset(path, row_count, column_count, sep='\t'):
    # - can't use `pandas.read_csv` because it may result in 5x memory overhead
    # - can't use `numpy.loadtxt` because it may result in 3x memory overhead
    # Both of the solutions above are also very slow compared to the one implemented below.
    dataset = np.zeros((row_count, column_count, ), dtype=np.float32, order='F')
    with open(path, 'rb') as f:
        for line_idx, line in enumerate(f):
            # `str.split()` is too slow, use `numpy.fromstring()`
            row = np.fromstring(line, dtype=np.float32, sep=sep)
            assert row.size == column_count, 'unexpected number of columns at line %d (expected %d, got %d)' % (line_idx + 1, column_count, row.size)
            # `dataset[line_idx][:]` instead of `dataset[line_idx]` is here on purpose:
            # this way we copy into the preallocated array instead of reallocating memory
            dataset[line_idx][:] = row

        assert line_idx + 1 == row_count, 'unexpected number of lines (expected %d, got %d)' % (row_count, line_idx + 1)

    return pd.DataFrame(dataset)
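

# A minimal sketch of the input `_load_numeric_only_dataset` expects: a headerless,
# purely numeric, separator-delimited text file with exactly `column_count` values per
# line, e.g. (hypothetical file 'small.tsv' with 2 rows and 3 columns):
#
#     df = _load_numeric_only_dataset('small.tsv', row_count=2, column_count=3)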


def titanic():
    """
    Download the "titanic" data set.

    Returns two pandas.DataFrames: the first with the train part and the second with the test
    part of the dataset.
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/233854/titanic.tar.gz'
    md5 = '9c8bc61d545c6af244a1d37494df3fc3'
    dataset_name, train_file, test_file = 'titanic', 'train.csv', 'test.csv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file)
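

# A minimal usage sketch (assuming this module is importable as `catboost.datasets`):
#
#     from catboost.datasets import titanic
#     titanic_train, titanic_test = titanic()
#     print(titanic_train.shape, titanic_test.shape)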


def amazon():
    """
    Download the "amazon" data set.

    Returns two pandas.DataFrames: the first with the train part and the second with the test
    part of the dataset.
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/amazon.tar.gz'
    md5 = '8fe3eec12bfd9c4c532b24a181d0aa2c'
    dataset_name, train_file, test_file = 'amazon', 'train.csv', 'test.csv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file)


def msrank():
    """
    Download the "msrank" data set.

    Returns two pandas.DataFrames: the first with the train part and the second with the test
    part of the dataset. The dataset is downloaded once and then cached on disk (see
    `set_cache_path`).
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/233854/msrank.tar.gz'
    md5 = '34fee225d02419adc106581f4eb36f2e'
    dataset_name, train_file, test_file = 'msrank', 'train.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, header=None, sep='\t', cache=True)


def msrank_10k():
    """
    Download the "msrank_10k" data set, a small (10k) sample of msrank.

    Returns two pandas.DataFrames: the first with the train part and the second with the test
    part of the dataset.
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/msrank_10k.tar.gz'
    md5 = '79c5b67397289c4c8b367c1f34629eae'
    dataset_name, train_file, test_file = 'msrank_10k', 'train.csv', 'test.csv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, header=None)


def rotten_tomatoes():
    """
    Download the "rotten_tomatoes" [1] data set.

    Returns two pandas.DataFrames: the first with the train part (rotten_tomatoes.data) and the
    second with the test part (rotten_tomatoes.test) of the dataset.

    Contains information from Kaggle [1], which is made available here under the Open Database
    License (ODbL) [2].

    NOTE: This is a preprocessed version of the dataset.

    [1]: https://www.kaggle.com/rpnuser8182/rotten-tomatoes
    [2]: https://opendatacommons.org/licenses/odbl/1-0/index.html
    """
    url = 'https://catboost-opensource.s3.yandex.net/rotten_tomatoes.tar.gz'
    md5 = 'a07fed612805ac9e17ced0d82a96add4'
    dataset_name, train_file, test_file = 'rotten_tomatoes', 'learn.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t')


def imdb():
    """
    Download the "imdb" data set.

    Returns two pandas.DataFrames: the first with the train part (learn.tsv) and the second with
    the test part (test.tsv) of the dataset.
    """
    url = 'https://catboost-opensource.s3.yandex.net/imdb.tar.gz'
    md5 = '0fd62578d631ac3d71a71c3e6ced6f8b'
    dataset_name, train_file, test_file = 'imdb', 'learn.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t')


def epsilon():
    """
    Download the "epsilon" [1] data set.

    Returns two pandas.DataFrames: the first with the train part (epsilon_normalized) and the
    second with the test part (epsilon_normalized.t) of the dataset. The object class is located
    in the first column of the dataset.

    NOTE: This is a preprocessed version of the dataset. It was converted from libsvm format into
    tsv (CatBoost doesn't support libsvm format out of the box).

    [1]: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#epsilon
    """
    urls = (
        'https://proxy.sandbox.yandex-team.ru/785711439',
        'https://storage.mds.yandex.net/get-devtools-opensource/250854/epsilon.tar.gz', )
    md5 = '5bbfac403ac673da7d7ee84bd532e973'
    dataset_name, train_file, test_file = 'epsilon', 'train.tsv', 'test.tsv'
    train_path, test_path = _download_dataset(urls, md5, dataset_name, train_file, test_file, cache=True)
    return (
        _load_numeric_only_dataset(train_path, 400000, 2001, sep='\t'),
        _load_numeric_only_dataset(test_path, 100000, 2001, sep='\t'))
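

# A minimal usage sketch (assuming a reasonably recent pandas): the label is the first
# column, which gets the integer column name 0 in the resulting DataFrame. Note that the
# train part alone is 400000 x 2001 float32 values, i.e. roughly 3 GiB in memory.
#
#     train, test = epsilon()
#     y_train, X_train = train[0], train.drop(columns=[0])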


def monotonic1():
    """
    Dataset with monotonic constraints.
    Can be used for Poisson regression.
    Has several numerical and several categorical features.
    The first column contains target values. Columns with names Cat* contain categorical
    features. Columns with names Num* contain numerical features.

    The dataset also contains several numerical features for which monotonic constraints must
    hold. For features in columns named MonotonicNeg*, if the feature value decreases, then the
    prediction value must not decrease. Thus, if there are two samples x1, x2 with all features
    being equal except for a monotonic negative feature M, such that x1[M] > x2[M], then the
    following inequality must hold for predictions: f(x1) <= f(x2).
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/479623/monotonic1.tar.gz'
    md5 = '1b9d8e15bc3fd6f1498e652e7fc4f4ca'
    dataset_name, train_file, test_file = 'monotonic1', 'train.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t', cache=True)


def monotonic2():
    """
    Dataset with monotonic constraints.
    Can be used for regression.
    The first column contains target values.
    The other columns contain numerical features, for which monotonic constraints must hold.

    For features in columns named MonotonicNeg*, if the feature value decreases, then the
    prediction value must not decrease. Thus, if there are two samples x1, x2 with all features
    being equal except for a monotonic negative feature MNeg, such that x1[MNeg] > x2[MNeg], then
    the following inequality must hold for predictions: f(x1) <= f(x2).
    For features in columns named MonotonicPos*, if the feature value decreases, then the
    prediction value must not increase. Thus, if there are two samples x1, x2 with all features
    being equal except for a monotonic positive feature MPos, such that x1[MPos] > x2[MPos],
    then the following inequality must hold for predictions: f(x1) >= f(x2).
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/monotonic2.tar.gz'
    md5 = 'ce559e212cb72c156269f6f9a641baca'
    dataset_name, train_file, test_file = 'monotonic2', 'train.tsv', 'test.tsv'
    return _load_dataset_pd(url, md5, dataset_name, train_file, test_file, sep='\t')
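

# A minimal usage sketch: derive a per-feature monotonicity direction from the column
# naming scheme described above (-1 for MonotonicNeg*, +1 for MonotonicPos*, 0 otherwise),
# e.g. for a learner that accepts such per-feature constraints.
#
#     train, test = monotonic2()
#     directions = [
#         -1 if str(c).startswith('MonotonicNeg') else
#         +1 if str(c).startswith('MonotonicPos') else 0
#         for c in train.columns[1:]]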


def adult():
    """
    Download the "Adult Data Set" [1] from the UCI Machine Learning Repository.

    Returns two pandas.DataFrames: the first with the train part (adult.data) and the second with
    the test part (adult.test) of the dataset.

    [1]: https://archive.ics.uci.edu/ml/datasets/Adult
    """
    # via https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
    names = (
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
        'native-country', 'income', )
    dtype = {
        'age': float, 'workclass': object, 'fnlwgt': float, 'education': object,
        'education-num': float, 'marital-status': object, 'occupation': object,
        'relationship': object, 'race': object, 'sex': object, 'capital-gain': float,
        'capital-loss': float, 'hours-per-week': float,
        'native-country': object, 'income': object, }

    # proxy.sandbox.yandex-team.ru is Yandex internal storage; we first try to download from
    # internal storage to avoid putting too much pressure on the UCI storage from our internal CI
    train_urls = (
        'https://proxy.sandbox.yandex-team.ru/779118052',
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', )
    train_md5 = '5d7c39d7b8804f071cdd1f2a7c460872'
    fd_train, train_path = tempfile.mkstemp()
    os.close(fd_train)
    _cached_download(train_urls, train_md5, train_path)

    test_urls = (
        'https://proxy.sandbox.yandex-team.ru/779120000',
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', )
    test_md5 = '35238206dfdf7f1fe215bbb874adecdc'
    fd_test, test_path = tempfile.mkstemp()
    os.close(fd_test)
    _cached_download(test_urls, test_md5, test_path)

    train_df = pd.read_csv(train_path, names=names, header=None, sep=r',\s*', na_values=['?'], engine='python')
    os.remove(train_path)

    # lines in the test part end with a dot, so we need to fix the last column of the dataset
    test_df = pd.read_csv(test_path, names=names, header=None, sep=r',\s*', na_values=['?'], skiprows=1, converters={'income': lambda x: x[:-1]}, engine='python')
    os.remove(test_path)

    # pandas 0.19.1 doesn't support the `dtype` parameter for `read_csv` when the `python` engine
    # is used, so we have to do the casting manually; also, we can't use `converters` together
    # with `dtype`
    train_df = train_df.astype(dtype)
    test_df = test_df.astype(dtype)

    return train_df, test_df
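

# A minimal usage sketch: the categorical feature columns can be identified by their
# `object` dtype (the `income` column is the label), e.g. for passing `cat_features`
# to a CatBoost model.
#
#     train_df, test_df = adult()
#     cat_features = [
#         c for c in train_df.columns
#         if train_df[c].dtype == object and c != 'income']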


def higgs():
    """
    Download the "higgs" [1] data set.

    Returns two pandas.DataFrames: the first with the train part and the second with the test
    part of the dataset. The object class is located in the first column of the dataset.

    [1]: https://archive.ics.uci.edu/ml/datasets/HIGGS
    """
    url = 'https://storage.mds.yandex.net/get-devtools-opensource/250854/higgs.tar.gz'
    md5 = 'ad59ba8328a9afa3837d7bf1a0e10e7b'
    dataset_name, train_file, test_file = 'higgs', 'train.tsv', 'test.tsv'
    train_path, test_path = _download_dataset(url, md5, dataset_name, train_file, test_file, cache=True)
    return (
        _load_numeric_only_dataset(train_path, 10500000, 29, sep='\t'),
        _load_numeric_only_dataset(test_path, 500000, 29, sep='\t'))