# Reference:
# Explanation of pandas sample: https://blog.csdn.net/shiheyingzhe/article/details/81835821
import numpy as np
import pandas as pd
# Down-sample the majority class by drawing random row positions with NumPy.
def lower_sample_data(df, percent=1):
    """Down-sample the majority class using NumPy-drawn row positions.

    Rows with ``label == 1`` are treated as the majority class and rows with
    ``label == 0`` as the minority class.  Randomness comes from NumPy's
    global random state, so call ``np.random.seed`` beforehand for
    reproducible output.

    Args:
        df: DataFrame containing a ``label`` column.
        percent: Ratio of the number of majority rows to keep relative to the
            number of minority rows.

    Returns:
        A DataFrame holding the sampled majority rows followed by all
        minority rows.  At most ``len(majority)`` majority rows are returned
        and no majority row appears more than once.

    Raises:
        ValueError: If ``percent`` yields a negative sample size.
    """
    most_data = df[df['label'] == 1]  # majority-class rows
    minority_data = df[df['label'] == 0]  # minority-class rows

    n_keep = int(percent * len(minority_data))
    if n_keep < 0:
        raise ValueError('percent must yield a non-negative sample size')
    # Down-sampling cannot keep more rows than the majority class contains.
    n_keep = min(n_keep, len(most_data))

    # A prefix of a random permutation is a sample *without* replacement, so
    # no majority row is duplicated; it also works for an empty majority set.
    index = np.random.permutation(len(most_data))[:n_keep]
    lower_data = most_data.iloc[index]
    return pd.concat([lower_data, minority_data])
# Down-sample the majority class with pandas' DataFrame.sample.
def lower_sample_data_by_sample(df, percent=1, random_state=0):
    """Down-sample the majority class using ``DataFrame.sample``.

    Rows with ``label == 1`` are treated as the majority class and rows with
    ``label == 0`` as the minority class.

    Args:
        df: DataFrame containing a ``label`` column.
        percent: Ratio of the number of majority rows to keep relative to the
            number of minority rows.
        random_state: Seed (or random state object) forwarded to
            ``DataFrame.sample``.  The default ``0`` reproduces the previously
            hard-coded behaviour; pass ``None`` for a different draw per call.

    Returns:
        A DataFrame holding the sampled majority rows followed by all
        minority rows.
    """
    most_data = df[df['label'] == 1]  # majority-class rows
    minority_data = df[df['label'] == 0]  # minority-class rows

    # sample(replace=False) raises when asked for more rows than exist, so
    # cap the request at the size of the majority class.
    n_keep = min(int(percent * len(minority_data)), len(most_data))

    # Randomly pick majority rows without replacement.
    lower_data = most_data.sample(
        n=n_keep, replace=False, random_state=random_state, axis=0
    )
    return pd.concat([lower_data, minority_data])
if __name__ == '__main__':
    # Seed NumPy's global generator so the demo data is reproducible.
    np.random.seed(0)

    feature_names = list('abcdef')
    # 100 majority rows with integers in [1, 10) and 5 minority rows with
    # integers in [20, 50), six feature columns each.
    majority_values = np.random.randint(1, 10, size=(100, 6))
    minority_values = np.random.randint(20, 50, size=(5, 6))

    # Wrap the arrays in DataFrames and tag each with its class label.
    majority_frame = pd.DataFrame(
        majority_values, columns=feature_names
    ).assign(label=1)
    minority_frame = pd.DataFrame(
        minority_values, columns=feature_names
    ).assign(label=0)

    # Stack both classes into one imbalanced data set.
    df = pd.concat([majority_frame, minority_frame])

    # Approach 1: down-sample via NumPy-drawn row positions.
    print(lower_sample_data(df))
    # Approach 2: down-sample via DataFrame.sample.
    print(lower_sample_data_by_sample(df))