常见方法：降采样，即对负样本随机采样，使“负样本数量/正样本数量”达到指定比例。
def get_downsample_data(df, neg_pos_scale, label_col=None, random_state=None):
    """Randomly downsample negative rows to a target negative/positive ratio.

    Rows whose label equals 1 are kept in full. Rows whose label equals 0 are
    sampled without replacement so that, up to rounding,
    ``len(neg) / len(pos) == neg_pos_scale``. Rows with any other label value
    are dropped, because only the ``== 1`` and ``== 0`` selections are
    concatenated into the result.

    :param df: DataFrame to downsample.
    :param neg_pos_scale: desired (number of negatives) / (number of
        positives); must be non-negative.
    :param label_col: name of the label column. When ``None`` (default) the
        module-level ``label`` name is used, as in the original code.
    :param random_state: forwarded to ``DataFrame.sample`` so the draw can be
        made reproducible; ``None`` keeps the unseeded behaviour.
    :return: new DataFrame with all positives followed by the sampled
        negatives, re-indexed from 0.
    :raises ValueError: if ``neg_pos_scale`` is negative.
    """
    if neg_pos_scale < 0:
        raise ValueError('neg_pos_scale must be non-negative, got %r' % (neg_pos_scale,))
    if label_col is None:
        # Backward compatibility: the label column name is expected to be
        # defined at module level elsewhere in the file.
        label_col = label

    pos = df[df[label_col] == 1].reset_index(drop=True)
    neg = df[df[label_col] == 0].reset_index(drop=True)
    n_pos, n_neg = pos.shape[0], neg.shape[0]

    if n_pos == 0 or n_neg == 0:
        # No ratio can be computed (the divisions below would fail), and there
        # is nothing to rebalance: return the selected rows unchanged.
        return pd.concat([pos, neg], axis=0).reset_index(drop=True)

    original_scale = n_neg / n_pos
    # Downsampling cannot create negatives: if more are requested than exist,
    # keep all of them instead of letting sample(frac > 1) raise.
    target_scale = min(neg_pos_scale, original_scale)
    print('original pos_neg_scale = 1:%.2f, now updated to 1:%.2f' % (original_scale, target_scale))

    rate = min(1.0, n_pos * neg_pos_scale / n_neg)
    neg = neg.sample(frac=rate, random_state=random_state)
    data = pd.concat([pos, neg], axis=0).reset_index(drop=True)
    return data