想使用少数类样本和边界多数类样本的集合训练模型。但是代码不知道怎么写,于是参考了imblearn库的borderline-smote源代码及https://www.cnblogs.com/zywnnblog/p/15332826.html
BorderlineSMOTE,这个类可以在使用SMOTE时通过传入borderline参数被调用,也可以直接调用。这个类中最核心的函数是_fit_resample方法(参考文章中称为_sample),它做了两件事:
第一件事和SMOTE中的类似,检查KNN近邻估计器是否被传入并完成校验(k_neighbors和m_neighbors),还有就是检查用户使用的是borderline-1还是borderline-2算法(对应下面代码中的_validate_estimator)。
第二件事就是生成新样本的逻辑,其中插值的逻辑同样调用BaseSMOTE的_make_samples方法,Borderline和原始SMOTE不一样的地方就在于需要把样本划分为safe和danger(这个方法同样在BaseSMOTE中实现)选取种子样本,再按照borderline-1/2中不同的策略生成新样本。
详细逻辑见代码中的注释。
class BorderlineSMOTE(BaseSMOTE):
    """Over-sampling using Borderline SMOTE.

    This algorithm is a variant of the original SMOTE algorithm proposed in
    [2]_. Borderline samples will be detected and used to generate new
    synthetic samples.

    Read more in the :ref:`User Guide <smote_adasyn>`.

    .. versionadded:: 0.4

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    k_neighbors : int or object, default=5
        If ``int``, number of nearest neighbours to use to construct synthetic
        samples.  If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    {n_jobs}

    m_neighbors : int or object, default=10
        If int, number of nearest neighbours to use to determine if a minority
        sample is in danger. If object, an estimator that inherits
        from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used
        to find the m_neighbors.

    kind : {{"borderline-1", "borderline-2"}}, default='borderline-1'
        The type of SMOTE algorithm to use one of the following options:
        ``'borderline-1'``, ``'borderline-2'``.

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        corresponds to the class labels from which to sample and the values
        are the number of samples to sample.

    nn_k_ : estimator object
        Validated k-nearest neighbours created from the `k_neighbors` parameter.

    nn_m_ : estimator object
        Validated m-nearest neighbours created from the `m_neighbors` parameter.

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    See Also
    --------
    SMOTE : Over-sample using SMOTE.

    SMOTENC : Over-sample using SMOTE for continuous and categorical features.

    SVMSMOTE : Over-sample using SVM-SMOTE variant.

    ADASYN : Over-sample using ADASYN.

    KMeansSMOTE : Over-sample applying a clustering before to oversample using
        SMOTE.

    Notes
    -----
    See the original papers: [2]_ for more details.

    Supports multi-class resampling. A one-vs.-rest scheme is used as
    originally proposed in [1]_.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       intelligent computing, 878-887, 2005.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.over_sampling import BorderlineSMOTE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> sm = BorderlineSMOTE(random_state=42)
    >>> X_res, y_res = sm.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 900, 1: 900}})
    """

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        k_neighbors=5,
        n_jobs=None,
        m_neighbors=10,
        kind="borderline-1",
    ):
        super().__init__(
            sampling_strategy=sampling_strategy,
            random_state=random_state,
            k_neighbors=k_neighbors,
            n_jobs=n_jobs,
        )
        self.m_neighbors = m_neighbors
        self.kind = kind

    def _validate_estimator(self):
        """Build ``nn_m_`` from ``m_neighbors`` and validate ``kind``.

        Raises
        ------
        ValueError
            If ``kind`` is neither ``"borderline-1"`` nor ``"borderline-2"``.
        """
        super()._validate_estimator()
        # One extra neighbour is requested; the callers below discard the
        # first neighbour column of every query.
        self.nn_m_ = check_neighbors_object(
            "m_neighbors", self.m_neighbors, additional_neighbor=1
        )
        self.nn_m_.set_params(**{"n_jobs": self.n_jobs})
        if self.kind not in ("borderline-1", "borderline-2"):
            raise ValueError(
                'The possible "kind" of algorithm are '
                '"borderline-1" and "borderline-2". '
                f"Got {self.kind} instead."
            )

    def _fit_resample(self, X, y):
        """Append synthetic samples generated from the borderline samples.

        Parameters
        ----------
        X : array-like or sparse matrix
            Input samples.
        y : array-like
            Class label of each row of ``X``.

        Returns
        -------
        X_resampled, y_resampled
            Copies of ``X`` and ``y`` with the generated samples stacked
            after the original rows.
        """
        self._validate_estimator()

        # Work on copies of the full data set; new rows are appended to them.
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                # No sample is requested for this class (typically the
                # majority class), so skip it.
                continue

            # Row indices, then rows, of the class being over-sampled.
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            # Fit the m-neighbours model on the whole data set; it is used to
            # detect the "danger" (borderline) samples. ``m_neighbors``
            # defaults to 10.
            self.nn_m_.fit(X)
            # kind="danger" selects the borderline samples; the helper also
            # accepts kind="noise". It is not defined in this class
            # (presumably inherited from BaseSMOTE).
            danger_index = self._in_danger_noise(
                self.nn_m_, X_class, class_sample, y, kind="danger"
            )
            if not any(danger_index):
                continue

            # The borderline samples are the seeds for every generation call
            # below, so select them once.
            X_danger = _safe_indexing(X_class, danger_index)

            # Neighbours of the borderline samples among the samples of the
            # same class; the first column is dropped.
            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]

            # divergence between borderline-1 and borderline-2
            if self.kind == "borderline-1":
                # borderline-1: the neighbours used for interpolation belong
                # to the over-sampled class only.
                X_new, y_new = self._make_samples(
                    X_danger,
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    n_samples,
                )
                if sparse.issparse(X_new):
                    X_resampled = sparse.vstack([X_resampled, X_new])
                else:
                    X_resampled = np.vstack((X_resampled, X_new))
                y_resampled = np.hstack((y_resampled, y_new))

            elif self.kind == "borderline-2":
                # borderline-2: interpolation may also use samples from any
                # other class. A Beta(10, 10) draw splits the requested
                # number of samples between the two generation passes.
                random_state = check_random_state(self.random_state)
                fractions = random_state.beta(10, 10)

                # only minority
                X_new_1, y_new_1 = self._make_samples(
                    X_danger,
                    y.dtype,
                    class_sample,
                    X_class,
                    nns,
                    int(fractions * (n_samples + 1)),
                    step_size=1.0,
                )

                # we use a one-vs-rest policy to handle the multiclass in which
                # new samples will be created considering not only the majority
                # class but all other classes.
                # NOTE(review): ``nns`` was computed against ``X_class`` but is
                # passed here together with the samples of the other classes;
                # confirm that ``_make_samples`` pairs them as intended.
                X_new_2, y_new_2 = self._make_samples(
                    X_danger,
                    y.dtype,
                    class_sample,
                    _safe_indexing(X, np.flatnonzero(y != class_sample)),
                    nns,
                    int((1 - fractions) * n_samples),
                    step_size=0.5,
                )

                if sparse.issparse(X_resampled):
                    X_resampled = sparse.vstack([X_resampled, X_new_1, X_new_2])
                else:
                    X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
                y_resampled = np.hstack((y_resampled, y_new_1, y_new_2))

        return X_resampled, y_resampled
修改后:
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing
from imblearn.utils import check_neighbors_object
def _in_danger_noise(self, nn_estimator, samples, target_class, y, kind="danger"):
    """Flag each sample as borderline ("danger") or as noise.

    The decision is based on how many of a sample's neighbours carry a label
    different from ``target_class``.

    Parameters
    ----------
    nn_estimator : object
        Neighbours estimator exposing ``kneighbors`` and ``n_neighbors``.
        The first neighbour returned for every sample is discarded, so
        ``n_neighbors - 1`` neighbours are actually considered (presumably
        because each sample is its own nearest neighbour; confirm that the
        estimator was fitted on data containing ``samples``).
    samples : array-like
        Samples to classify; passed unchanged to ``kneighbors``.
    target_class : label
        Class the samples belong to.
    y : array-like
        Labels of the data the estimator returns indices into. Any
        array-like is accepted (ndarray, list, pandas Series); it is read
        positionally.
    kind : {"danger", "noise"}, default="danger"
        Which mask to compute.

    Returns
    -------
    ndarray of bool, one entry per sample
        ``"danger"``: at least half, but not all, of the considered
        neighbours belong to another class. ``"noise"``: all of them do.

    Raises
    ------
    NotImplementedError
        If ``kind`` is neither ``"danger"`` nor ``"noise"``.
    """
    neighbour_idx = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:]
    # np.asarray makes the lookup positional for every array-like; indexing a
    # list or a pandas Series directly with a 2-D integer array would fail or
    # select by label instead of by position.
    labels = np.asarray(y)
    # Per sample: number of neighbours that do NOT belong to target_class.
    n_other = np.sum(labels[neighbour_idx] != target_class, axis=1)
    n_considered = nn_estimator.n_neighbors - 1

    if kind == "danger":
        # Samples are in danger for m/2 <= m' < m, with m = n_neighbors - 1.
        return np.bitwise_and(
            n_other >= n_considered / 2,
            n_other < n_considered,
        )
    if kind == "noise":
        # Samples are noise for m = m'.
        return n_other == n_considered
    raise NotImplementedError(
        f'Unknown kind {kind!r}; expected "danger" or "noise".'
    )
def borderline_sampling(
    self, X, y, minority_class=1, majority_class=0, m_neighbors=10
):
    """Keep every minority sample plus the borderline majority samples.

    No synthetic sample is generated: the result is the subset of ``(X, y)``
    made of all rows labelled ``minority_class`` followed by the rows
    labelled ``majority_class`` that ``_in_danger_noise`` flags as "danger".

    Parameters
    ----------
    X : array-like
        Input samples. Must support boolean-mask indexing and be stackable
        with ``np.vstack`` (i.e. dense).
    y : array-like
        Labels of ``X``; must support element-wise comparison and
        boolean-mask indexing (e.g. an ndarray).
    minority_class : label, default=1
        Class whose samples are all kept.
    majority_class : label, default=0
        Class from which only the borderline samples are kept.
    m_neighbors : int or object, default=10
        Forwarded to ``check_neighbors_object`` to build the neighbours
        model used for the borderline detection.

    Returns
    -------
    X_sampled, y_sampled
        Minority rows first, then the borderline majority rows, with their
        labels.

    Notes
    -----
    Overwrites ``self.m_neighbors`` and ``self.nn_m_``.
    """
    minority_mask = y == minority_class
    X_minority = X[minority_mask]
    y_minority = y[minority_mask]

    self.m_neighbors = m_neighbors
    # One extra neighbour is requested because ``_in_danger_noise`` is called
    # with this estimator and discards the first neighbour of each query.
    self.nn_m_ = check_neighbors_object(
        "m_neighbors", self.m_neighbors, additional_neighbor=1
    )

    majority_indices = np.where(y == majority_class)[0]
    X_majority = _safe_indexing(X, majority_indices)

    # Neighbours are searched in the whole data set so that both classes can
    # appear around each majority sample.
    self.nn_m_.fit(X)
    danger_index = self._in_danger_noise(
        self.nn_m_, X_majority, majority_class, y, kind="danger"
    )
    X_danger = _safe_indexing(X_majority, danger_index)
    y_danger = np.full(X_danger.shape[0], majority_class)

    X_sampled = np.vstack((X_minority, X_danger))
    y_sampled = np.hstack((y_minority, y_danger))
    return X_sampled, y_sampled