数据不平衡:下采样、上采样的 Python 代码实现

import numpy as np
import pandas as pd

#下采样
def lower_sample_data(df, percent=1):
    """Down-sample the majority class (``Label == 0``) of *df*.

    Majority rows are drawn uniformly at random **with replacement** via
    ``np.random.randint``, so the result may contain duplicate majority rows.

    Args:
        df: DataFrame with a ``Label`` column. Rows labelled 0 are treated as
            the majority class; rows labelled 1 as the minority class.
        percent: Number of majority rows to draw, expressed as a multiple of
            the count of rows that are *not* labelled 0. Fractional results
            are truncated toward zero.

    Returns:
        A new DataFrame holding the sampled majority rows followed by every
        row with ``Label == 1``. Original index labels are kept.
    """
    majority = df[df['Label'] == 0]
    minority = df[df['Label'] == 1]
    # randint needs an integer size; without int() a float ``percent``
    # (e.g. 0.5) raises TypeError.
    n_draw = int(percent * (len(df) - len(majority)))
    # Random positional indices into the majority subset.
    positions = np.random.randint(len(majority), size=n_draw)
    sampled_majority = majority.iloc[positions]
    return pd.concat([sampled_majority, minority])

#上采样
def up_sample_data(df, percent=0.2):
    """Re-sample the minority class (``Label == 1``) of *df* with replacement.

    Minority rows are drawn uniformly at random with replacement via
    ``np.random.randint``. Only the drawn rows are returned for the minority
    class, so an original minority row that is never drawn is absent from
    the result, while others may appear several times.

    Args:
        df: DataFrame with a ``Label`` column. Rows labelled 0 are treated as
            the majority class; rows labelled 1 as the minority class.
        percent: Number of minority rows to draw, expressed as a fraction of
            the count of rows that are *not* labelled 1. The result is
            truncated toward zero. The original author recommends keeping
            this at or below 0.5 to limit over-fitting.

    Returns:
        A new DataFrame holding the re-sampled minority rows followed by
        every row with ``Label == 0``. Original index labels are kept.
    """
    majority = df[df['Label'] == 0]
    minority = df[df['Label'] == 1]
    # Target minority size is relative to the non-minority row count.
    n_draw = int(percent * (len(df) - len(minority)))
    # Random positional indices into the minority subset.
    positions = np.random.randint(len(minority), size=n_draw)
    resampled_minority = minority.iloc[positions]
    return pd.concat([resampled_minority, majority])

#上下混合采样
def up_lower_sample_data(df, up_percent=0.2, lower_percent=0.5):
    """Re-sample both classes of *df*: minority up, majority down.

    Both classes are drawn uniformly at random **with replacement** via
    ``np.random.randint``, so either part of the result may contain
    duplicates and may omit some original rows.

    Args:
        df: DataFrame with a ``Label`` column. Rows labelled 0 are treated as
            the majority class; rows labelled 1 as the minority class.
        up_percent: Number of minority rows to draw, as a fraction of the
            count of rows that are *not* labelled 1 (truncated toward zero).
        lower_percent: Number of majority rows to draw, as a fraction of the
            same count (truncated toward zero).

    Returns:
        A new DataFrame holding the re-sampled minority rows followed by the
        re-sampled majority rows. Original index labels are kept.
    """
    majority = df[df['Label'] == 0]
    minority = df[df['Label'] == 1]
    # Both draw sizes are scaled by the same base: rows not labelled 1.
    base_count = len(df) - len(minority)

    # Up-sample: random positional indices into the minority subset.
    up_positions = np.random.randint(
        len(minority), size=int(up_percent * base_count))
    up_minority = minority.iloc[up_positions]

    # Down-sample: random positional indices into the majority subset.
    lower_positions = np.random.randint(
        len(majority), size=int(lower_percent * base_count))
    lower_majority = majority.iloc[lower_positions]

    return pd.concat([up_minority, lower_majority])

# Demo: build an imbalanced toy dataset and down-sample it.
np.random.seed(27)  # fixed seed so the printed sample is reproducible
# 100 majority rows with feature values in 0..5.
arr1 = np.random.randint(6, size=(100, 5))
# 10 minority rows with feature values in 1000..1009.
arr2 = np.random.randint(1000, 1010, size=(10, 5))
columns = ['A', 'B', 'C', 'D', 'E']
df1 = pd.DataFrame(arr1, columns=columns)
df1['Label'] = 0
df2 = pd.DataFrame(arr2, columns=columns)
df2['Label'] = 1
df = pd.concat([df1, df2])
# print(lower_sample_data(df))  # uncomment to inspect the DataFrame form
dArry = lower_sample_data(df).values  # convert to numpy.ndarray
print(dArry)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值