python pandas实现下采样

最新推荐文章于 2024-08-04 20:27:41 发布

frostjsy

最新推荐文章于 2024-08-04 20:27:41 发布

阅读量7.2k

点赞数 3

分类专栏： python 文章标签： python pandas数据抽样

本文链接：https://blog.csdn.net/u013069552/article/details/88935644

版权

python 专栏收录该内容

39 篇文章 2 订阅

订阅专栏

参考网址：

pandas sample解释网址：https://blog.csdn.net/shiheyingzhe/article/details/81835821

import numpy as np
import pandas as pd
 
#通过numpy随机选取多数样本的采样下标
def lower_sample_data(df, percent=1):
    """Down-sample the majority class of *df* using NumPy's global RNG.

    Rows with ``label == 1`` are treated as the majority class and rows with
    ``label == 0`` as the minority class.

    Args:
        df: DataFrame containing a ``label`` column.
        percent: Number of majority rows to keep, expressed as a multiple of
            the minority row count (``1`` yields a balanced result).

    Returns:
        A new DataFrame holding the sampled majority rows followed by all
        minority rows. Original index labels are preserved.

    Raises:
        ValueError: If ``percent`` produces a negative sample size.
    """
    most_data = df[df['label'] == 1]  # majority-class rows
    minority_data = df[df['label'] == 0]  # minority-class rows

    n_keep = int(percent * len(minority_data))
    if n_keep < 0:
        raise ValueError("percent must not produce a negative sample size")
    # Down-sampling can never keep more rows than the majority class has.
    n_keep = min(n_keep, len(most_data))

    # Take a prefix of a random permutation so every position is picked at
    # most once; drawing independent random integers would sample with
    # replacement and duplicate rows.
    positions = np.random.permutation(len(most_data))[:n_keep]
    lower_data = most_data.iloc[positions]
    return pd.concat([lower_data, minority_data])

#通过pandas的sample函数实现下采样
def lower_sample_data_by_sample(df, percent=1, random_state=0):
    """Down-sample the majority class of *df* with ``DataFrame.sample``.

    Rows with ``label == 1`` are treated as the majority class and rows with
    ``label == 0`` as the minority class.

    Args:
        df: DataFrame containing a ``label`` column.
        percent: Number of majority rows to keep, expressed as a multiple of
            the minority row count (``1`` yields a balanced result).
        random_state: Seed (or generator) forwarded to ``DataFrame.sample``.
            The default of ``0`` makes the result reproducible.

    Returns:
        A new DataFrame holding the sampled majority rows followed by all
        minority rows. Original index labels are preserved.
    """
    most_data = df[df['label'] == 1]  # majority-class rows
    minority_data = df[df['label'] == 0]  # minority-class rows

    # Down-sampling can never keep more rows than the majority class has;
    # without the cap, sample(replace=False) raises for oversized requests.
    n_keep = min(int(percent * len(minority_data)), len(most_data))

    # Randomly pick majority rows without replacement.
    lower_data = most_data.sample(
        n=n_keep, replace=False, random_state=random_state, axis=0
    )
    return pd.concat([lower_data, minority_data])
    

if __name__ == '__main__':
    # Seed NumPy's global RNG so the demo prints the same data on every run.
    np.random.seed(0)

    feature_names = list('abcdef')

    # Majority class: 100 rows x 6 columns of integers drawn from [1, 10).
    majority_values = np.random.randint(1, 10, size=(100, 6))
    # Minority class: 5 rows x 6 columns of integers drawn from [20, 50).
    minority_values = np.random.randint(20, 50, size=(5, 6))

    # Wrap both arrays in DataFrames and tag each with its class label.
    majority_frame = pd.DataFrame(majority_values, columns=feature_names)
    majority_frame['label'] = 1
    minority_frame = pd.DataFrame(minority_values, columns=feature_names)
    minority_frame['label'] = 0

    # Stack the two classes into a single imbalanced data set.
    dataset = pd.concat([majority_frame, minority_frame])

    # Approach 1: down-sample via NumPy-generated row positions.
    numpy_result = lower_sample_data(dataset)
    print(numpy_result)

    # Approach 2: down-sample via pandas' DataFrame.sample.
    pandas_result = lower_sample_data_by_sample(dataset)
    print(pandas_result)