from itertools import chain, compress
import numpy as np
import scipy.sparse as sp
from math import ceil, floor
import numbers
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        # Reuse NumPy's global RandomState singleton.
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)


class BaseShuffleSplit:
    """Base class for ShuffleSplit and StratifiedShuffleSplit"""

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        self.n_splits = n_splits
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        self._default_test_size = 0.1

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        for train, test in self._iter_indices(X, y, groups):
            yield train, test

    def _iter_indices(self, X, y=None, groups=None):
        """Generate (train, test) indices"""
        # Intentionally empty: subclasses must override this.  If called on
        # the base class, `split` will fail when iterating over None.

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.
        y : object
            Always ignored, exists for compatibility.
        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits

    def __repr__(self):
        # NOTE(review): _build_repr is not defined anywhere in this file --
        # presumably provided by the original module; calling repr() on an
        # instance will raise NameError as-is.  TODO confirm.
        return _build_repr(self)


class ShuffleSplit(BaseShuffleSplit):
    """Random permutation cross-validator

    Yields indices to split data into training and test sets.

    Note: contrary to other cross-validation strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.1.
    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    # >>> import numpy as np
    # >>> from sklearn.model_selection import ShuffleSplit
    # >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])
    # >>> y = np.array([1, 2, 1, 2, 1, 2])
    # >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
    # >>> rs.get_n_splits(X)
    # 5
    # >>> print(rs)
    # ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None)
    # >>> for train_index, test_index in rs.split(X):
    # ...     print("TRAIN:", train_index, "TEST:", test_index)
    # TRAIN: [1 3 0 4] TEST: [5 2]
    # TRAIN: [4 0 2 5] TEST: [1 3]
    # TRAIN: [1 2 4 0] TEST: [3 5]
    # TRAIN: [3 4 1 0] TEST: [5 2]
    # TRAIN: [3 5 1 0] TEST: [2 4]
    # >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,
    # ...                   random_state=0)
    # >>> for train_index, test_index in rs.split(X):
    # ...     print("TRAIN:", train_index, "TEST:", test_index)
    # TRAIN: [1 3 0] TEST: [5 2]
    # TRAIN: [4 0 2] TEST: [1 3]
    # TRAIN: [1 2 4] TEST: [3 5]
    # TRAIN: [3 4 1] TEST: [5 2]
    # TRAIN: [3 5 1] TEST: [2 4]
    """

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
        self._default_test_size = 0.1

    def _iter_indices(self, X, y=None, groups=None):
        """Yield (train, test) index arrays for each of the n_splits rounds."""
        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size,
            default_test_size=self._default_test_size)
        rng = check_random_state(self.random_state)
        for i in range(self.n_splits):
            # Random partition: first n_test shuffled indices form the test
            # set, the next n_train form the train set.
            permutation = rng.permutation(n_samples)
            ind_test = permutation[:n_test]
            ind_train = permutation[n_test:(n_test + n_train)]
            yield ind_train, ind_test
def_num_samples(x):"""Return number of samples in array-like x."""
message ='Expected sequence or array-like, got %s'%type(x)ifhasattr(x,'fit')andcallable(x.fit):# Don't get num_samples from an ensembles length!raise TypeError(message)ifnothasattr(x,'__len__')andnothasattr(x,'shape'):ifhasattr(x,'__array__'):
x = np.asarray(x)else:raise TypeError(message)ifhasattr(x,'shape')and x.shape isnotNone:iflen(x.shape)==0:raise TypeError("Singleton array %r cannot be considered"" a valid collection."% x)# Check that shape is returning an integer or default to len# Dask dataframes may not return numeric shape[0] valueifisinstance(x.shape[0], numbers.Integral):return x.shape[0]try:returnlen(x)except TypeError as type_error:raise TypeError(message)from type_error
defcheck_consistent_length(*arrays):"""Check that all arrays have consistent first dimensions.
Checks whether all objects in arrays have the same shape or length.
Parameters
----------
*arrays : list or tuple of input objects.
Objects that will be checked for consistent length.
"""
lengths =[_num_samples(X)for X in arrays if X isnotNone]
uniques = np.unique(lengths)iflen(uniques)>1:raise ValueError("Found input variables with inconsistent numbers of"" samples: %r"%[int(l)for l in lengths])def_make_indexable(iterable):"""Ensure iterable supports indexing or convert to an indexable variant.
Convert sparse matrices to csr and other non-indexable iterable to arrays.
Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.
Parameters
----------
iterable : {list, dataframe, ndarray, sparse matrix} or None
Object to be converted to an indexable iterable.
"""if sp.issparse(iterable):return iterable.tocsr()elifhasattr(iterable,"__getitem__")orhasattr(iterable,"iloc"):return iterable
elif iterable isNone:return iterable
return np.array(iterable)defindexable(*iterables):"""Make arrays indexable for cross-validation.
Checks consistent length, passes through None, and ensures that everything
can be indexed by converting sparse matrices to csr and converting
non-interable objects to arrays.
Parameters
----------
*iterables : {lists, dataframes, ndarrays, sparse matrices}
List of objects to ensure sliceability.
"""
result =[_make_indexable(X)for X in iterables]
check_consistent_length(*result)return result
def_validate_shuffle_split(n_samples, test_size, train_size,
default_test_size=None):"""
Validation helper to check if the test/test sizes are meaningful wrt to the
size of the data (n_samples)
"""if test_size isNoneand train_size isNone:
test_size = default_test_size
# 获得数据类型
test_size_type = np.asarray(test_size).dtype.kind
train_size_type = np.asarray(train_size).dtype.kind
if(test_size_type =='i'and(test_size >= n_samples or test_size <=0)or test_size_type =='f'and(test_size <=0or test_size >=1)):raise ValueError('test_size={0} should be either positive and smaller'' than the number of samples {1} or a float in the ''(0, 1) range'.format(test_size, n_samples))if(train_size_type =='i'and(train_size >= n_samples or train_size <=0)or train_size_type =='f'and(train_size <=0or train_size >=1)):raise ValueError('train_size={0} should be either positive and smaller'' than the number of samples {1} or a float in the ''(0, 1) range'.format(train_size, n_samples))if train_size isnotNoneand train_size_type notin('i','f'):raise ValueError("Invalid value for train_size: {}".format(train_size))if test_size isnotNoneand test_size_type notin('i','f'):raise ValueError("Invalid value for test_size: {}".format(test_size))if(train_size_type =='f'and test_size_type =='f'and
train_size + test_size >1):raise ValueError('The sum of test_size and train_size = {}, should be in the (0, 1)'' range. Reduce test_size and/or train_size.'.format(train_size + test_size))if test_size_type =='f':
n_test = ceil(test_size * n_samples)elif test_size_type =='i':
n_test =float(test_size)if train_size_type =='f':
n_train = floor(train_size * n_samples)elif train_size_type =='i':
n_train =float(train_size)if train_size isNone:
n_train = n_samples - n_test
elif test_size isNone:
n_test = n_samples - n_train
if n_train + n_test > n_samples:raise ValueError('The sum of train_size and test_size = %d, ''should be smaller than the number of ''samples %d. Reduce test_size and/or ''train_size.'%(n_train + n_test, n_samples))
n_train, n_test =int(n_train),int(n_test)if n_train ==0:raise ValueError('With n_samples={}, test_size={} and train_size={}, the ''resulting train set will be empty. Adjust any of the ''aforementioned parameters.'.format(n_samples, test_size,
train_size))return n_train, n_test
def_list_indexing(X, key, key_dtype):"""Index a Python list."""if np.isscalar(key)orisinstance(key,slice):# key is a slice or a scalarreturn X[key]if key_dtype =='bool':# key is a boolean array-likereturnlist(compress(X, key))# key is a integer array-like of keyreturn[X[idx]for idx in key]def_determine_key_type(key, accept_slice=True):"""Determine the data type of key.
Parameters
----------
key : scalar, slice or array-like
The key from which we want to infer the data type.
accept_slice : bool, default=True
Whether or not to raise an error if the key is a slice.
Returns
-------
dtype : {'int', 'str', 'bool', None}
Returns the data type of key.
"""
err_msg =("No valid specification of the columns. Only a scalar, list or ""slice of all integers or all strings, or boolean mask is ""allowed")
dtype_to_str ={int:'int',str:'str',bool:'bool', np.bool_:'bool'}
array_dtype_to_str ={'i':'int','u':'int','b':'bool','O':'str','U':'str','S':'str'}if key isNone:returnNoneifisinstance(key,tuple(dtype_to_str.keys())):try:return dtype_to_str[type(key)]except KeyError:raise ValueError(err_msg)ifisinstance(key,slice):ifnot accept_slice:raise TypeError('Only array-like or scalar are supported. ''A Python slice was given.')if key.start isNoneand key.stop isNone:returnNone
key_start_type = _determine_key_type(key.start)
key_stop_type = _determine_key_type(key.stop)if key_start_type isnotNoneand key_stop_type isnotNone:if key_start_type != key_stop_type:raise ValueError(err_msg)if key_start_type isnotNone:return key_start_type
return key_stop_type
ifisinstance(key,(list,tuple)):
unique_key =set(key)
key_type ={_determine_key_type(elt)for elt in unique_key}ifnot key_type:returnNoneiflen(key_type)!=1:raise ValueError(err_msg)return key_type.pop()ifhasattr(key,'dtype'):try:return array_dtype_to_str[key.dtype.kind]except KeyError:raise ValueError(err_msg)raise ValueError(err_msg)def_safe_indexing(X, indices,*, axis=0):"""Return rows, items or columns of X using indices.
.. warning::
This utility is documented, but **private**. This means that
backward compatibility might be broken without any deprecation
cycle.
Parameters
----------
X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
Data from which to sample rows, items or columns. `list` are only
supported when `axis=0`.
indices : bool, int, str, slice, array-like
- If `axis=0`, boolean and integer array-like, integer slice,
and scalar integer are supported.
- If `axis=1`:
- to select a single column, `indices` can be of `int` type for
all `X` types and `str` only for dataframe. The selected subset
will be 1D, unless `X` is a sparse matrix in which case it will
be 2D.
- to select multiples columns, `indices` can be one of the
following: `list`, `array`, `slice`. The type used in
these containers can be one of the following: `int`, 'bool' and
`str`. However, `str` is only supported when `X` is a dataframe.
The selected subset will be 2D.
axis : int, default=0
The axis along which `X` will be subsampled. `axis=0` will select
rows while `axis=1` will select columns.
Returns
-------
subset
Subset of X on axis 0 or 1.
Notes
-----
CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
not supported.
"""if indices isNone:return X
if axis notin(0,1):raise ValueError("'axis' should be either 0 (to index rows) or 1 (to index "" column). Got {} instead.".format(axis))
indices_dtype = _determine_key_type(indices)if axis ==0and indices_dtype =='str':raise ValueError("String indexing is not supported with 'axis=0'")if axis ==1and X.ndim !=2:raise ValueError("'X' should be a 2D NumPy array, 2D sparse matrix or pandas ""dataframe when indexing the columns (i.e. 'axis=1'). ""Got {} instead with {} dimension(s).".format(type(X), X.ndim))if axis ==1and indices_dtype =='str'andnothasattr(X,'loc'):raise ValueError("Specifying the columns using strings is only supported for ""pandas DataFrames")return _list_indexing(X, indices, indices_dtype)deftrain_test_split(*arrays,
test_size=None,
train_size=None,
random_state=None,
shuffle=True,
stratify=None):
n_arrays =len(arrays)if n_arrays ==0:raise ValueError("At least one array required as input")# 已经判断都具有相同长度
arrays = indexable(*arrays)
n_samples = _num_samples(arrays[0])# 获得train和test合法数字
n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
default_test_size=0.25)if shuffle isFalse:if stratify isnotNone:raise ValueError("Stratified train/test split is not implemented for ""shuffle=False")
train = np.arange(n_train)
test = np.arange(n_train, n_train + n_test)else:
CVClass = ShuffleSplit
cv = CVClass(test_size=n_test,
train_size=n_train,
random_state=random_state)
train, test =next(cv.split(X=arrays[0], y=stratify))returnlist(chain.from_iterable((_safe_indexing(a, train), _safe_indexing(a, test))for a in arrays))
if __name__ == "__main__":
    # Demo: split a tiny (5, 2) dataset with the default 25% test size.
    # Guarded so importing this module does not trigger the split.
    X, y = np.arange(10).reshape((5, 2)), range(5)
    train_test_split(X, y)