SMOTE原理及实现

最新推荐文章于 2024-07-15 18:42:33 发布

panda_zjd

最新推荐文章于 2024-07-15 18:42:33 发布

阅读量5.7k

点赞数 1

分类专栏：算法、数据预处理　文章标签：smote、算法、过采样

本文链接：https://blog.csdn.net/panda_zjd/article/details/79200493

版权

算法同时被 2 个专栏收录

9 篇文章 1 订阅

订阅专栏

数据预处理

4 篇文章 1 订阅

订阅专栏

Smote算法原理：
这里写图片描述
python2.7 算法实现：原算法只能针对采样率N小于100%或者N为100%整数倍的参数进行采样。我实现的代码可对任意N>0的采样率进行SMOTE，详情见源码。

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from sklearn.neighbors import NearestNeighbors
import numpy as np
import warnings
from sklearn.datasets import load_iris
warnings.filterwarnings("ignore")

class Smote(object):
    """Generate synthetic minority-class samples with SMOTE.

    ``data`` must contain the minority-class samples only, one row per
    sample and one column per attribute.

    Args:
        data: 2-D array-like of minority-class samples.
        N: Oversampling rate in percent.  Any value > 0 is supported: each
            full 100% contributes one synthetic sample per original row,
            and a remaining fraction ``r`` (< 100) contributes synthetic
            samples for ``int(len(data) * r / 100)`` rows drawn at random
            without replacement.
        k: Number of nearest neighbours to choose from when interpolating.
    """

    def __init__(self, data, N=100, k=5):
        # Accept any array-like (e.g. a list of rows); an ndarray is kept
        # as-is without copying.
        self.data = np.asarray(data)
        self.N = N
        # Stored as k + 1: a neighbour query that includes the point itself
        # needs one extra slot.  oversample() subtracts the 1 again because
        # it uses a query that already excludes the point itself.
        self.k = k + 1
        self.n_attrs = self.data.shape[1]

    def oversample(self):
        """Return a list of synthetic samples (1-D arrays of n_attrs values).

        Each synthetic sample is ``x + gap * (neighbour - x)``, where
        ``neighbour`` is picked uniformly from the k nearest minority
        neighbours of ``x`` and ``gap`` is drawn independently per attribute
        from the uniform distribution on [0, 1).
        """
        n_samples = len(self.data)

        # 1. Decide which rows serve as interpolation bases.
        whole_copies = int(self.N / 100)            # full multiples of 100%
        remainder_pct = self.N - 100 * whole_copies  # leftover below 100%

        # Fractional part: a random subset of distinct rows.
        n_partial = int(n_samples * remainder_pct / 100.0)
        if n_partial > 0:
            base_indices = list(
                np.random.choice(n_samples, size=n_partial, replace=False)
            )
        else:
            base_indices = []

        # Integer part: every row once per full 100%.
        if whole_copies > 0:
            base_indices.extend(list(range(n_samples)) * whole_copies)

        if not base_indices:
            return []

        # 2. Interpolate.  Querying with no X returns, for every fitted row,
        # its nearest neighbours with the row itself excluded, so duplicate
        # rows cannot cause a point to be dropped or kept by mistake.  The
        # query is done once for all rows instead of once per base sample.
        knn = NearestNeighbors(n_neighbors=self.k - 1).fit(self.data)
        neighbours = knn.kneighbors(return_distance=False)

        newsample = []
        for idx in base_indices:
            base = self.data[idx]
            chosen = np.random.choice(neighbours[idx])
            dif = self.data[chosen] - base
            gap = np.random.rand(self.n_attrs)
            newsample.append(base + gap * dif)

        return newsample

if __name__ == "__main__":
    # Build a toy data set: 100 two-dimensional standard-normal points.
    # Rows 0-49 get label 0 and rows 50-99 get label 1.
    data = np.random.randn(100, 2)
    i_label = np.ones(100)
    i_label[0:50] = 0
    # Append the label as the last column.
    data = np.column_stack([data, i_label])
    # SMOTE parameters: neighbour count and oversampling rate in percent.
    K = 3
    N = 120
    # Only the label-0 rows are passed in.  The label column is included as
    # an attribute; it is constant (0) across these rows, so interpolation
    # leaves it at 0.
    smote = Smote(data=data[:50], N=N, k=K)
    new = smote.oversample()    # new holds the synthetic samples
    # With 50 rows and N=120 this prints 60 (50 for the full 100% plus
    # int(50 * 20 / 100) = 10 for the remaining 20%).
    print(len(new))