原理
- 采用 KNN 算法,计算出每个少数类样本的 K 个近邻;
- 从 K 个近邻中随机挑选 N 个样本进行随机线性插值;
- 按 x_new = x + rand(0, 1) × (x_nn − x) 构造新的少数类样本,其中 x 为原少数类样本,x_nn 为选中的近邻;
- 将新样本放入原数据,产生新的训练集。
实现
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
class Smote:
    """Generate synthetic minority-class samples by SMOTE-style interpolation.

    Every synthetic sample lies between an original sample and one of its
    nearest neighbours, with an independent random interpolation factor
    drawn for each attribute.

    Args:
        samples: 2-D numpy array of shape ``(n_samples, n_attrs)`` holding
            the minority-class samples.
        N: Over-sampling rate in percent. Values below 100 synthesise one
            sample for a random ``N`` percent subset of ``samples``; values
            of 100 or more synthesise ``N // 100`` samples per original.
        k: Number of nearest neighbours considered for each sample.
    """

    def __init__(self, samples, N=10, k=5):
        self.n_samples, self.n_attrs = samples.shape
        self.N = N
        self.k = k
        self.samples = samples

    def over_sampling(self):
        """Build and return the synthetic samples.

        When ``N < 100`` the instance is modified in place: ``self.samples``
        is replaced by the randomly kept subset, ``self.n_samples`` by its
        size, and ``self.N`` is set to 100.

        Returns:
            numpy array of shape ``(n_samples * (N // 100), n_attrs)``. The
            same array is stored as ``self.synthetic``.

        Raises:
            ValueError: If fewer than two samples are available, since
                interpolation needs a sample and at least one neighbour.
        """
        if self.N < 100:
            # A rate below 100% is realised by synthesising one sample for
            # each member of a random subset of the originals.
            old_n_samples = self.n_samples
            self.n_samples = int(float(self.N) / 100 * old_n_samples)
            keep = np.random.permutation(old_n_samples)[:self.n_samples]
            self.samples = self.samples[keep]
            self.N = 100

        if self.n_samples < 2:
            raise ValueError(
                "SMOTE needs at least two samples to interpolate between; "
                "got {}".format(self.n_samples)
            )

        # Explicit floor division: any remainder of N above a multiple of
        # 100 is ignored.
        per_sample = int(self.N // 100)
        self.synthetic = np.zeros((self.n_samples * per_sample, self.n_attrs))
        self.new_index = 0

        # A sample cannot have more neighbours than there are other samples.
        k = min(self.k, self.n_samples - 1)
        knn = NearestNeighbors(n_neighbors=k).fit(self.samples)
        # Called without a query, kneighbors() returns the neighbours of
        # every fitted point and does not count a point as its own
        # neighbour, so no synthetic sample collapses onto its original.
        all_neighbors = knn.kneighbors(return_distance=False)
        for i, nnarray in enumerate(all_neighbors):
            self.__populate(per_sample, i, nnarray)
        return self.synthetic

    def __populate(self, N, i, nnarray):
        """Append ``N`` synthetic samples derived from sample ``i``.

        Args:
            N: Number of synthetic samples to create.
            i: Row index into ``self.samples`` of the base sample.
            nnarray: Row indices into ``self.samples`` of the candidate
                neighbours of sample ``i``.
        """
        base = self.samples[i]
        for _ in range(N):
            # Pick one of the supplied neighbours uniformly at random.
            nn = np.random.randint(0, len(nnarray))
            dif = self.samples[nnarray[nn]] - base
            # One interpolation factor in [0, 1) per attribute.
            gap = np.random.rand(self.n_attrs)
            self.synthetic[self.new_index] = base + gap * dif
            self.new_index += 1