数据挖掘中有时候会遇到不均衡样本,一种常用的方法是通过根据样本个数进行统计计算,或者根据业务场景人工计算权重。诸如sklearn的DT、RF等模型都设置了class_weight参数进行快速balanced;而XGB的fit中也有sample_weight参数,只是需要手动提供每个样本对应权重的array,这里提供了一段简单的计算样本权重的代码。
"""
根据labels序列进行统计,生成对应的sample权重
或者根据指定的权重字典,生成对应的sample权重
"""
import numpy as np
def sample_weights_cal(labels, flag=0, class_weights=None):
    """Compute per-class or per-sample weights for imbalanced data.

    Class weights are inversely proportional to class frequency (or taken
    from ``class_weights``) and normalized to sum to 1 across classes.

    :param labels: 1-D sequence of sample labels
    :param flag: 0 - return a {label: weight} dict,
                 1 - return a weight array aligned with ``labels``
    :param class_weights: optional {label: raw_weight} dict covering every
        label in ``labels``; when given, ``flag`` must be 1 and the weights
        are normalized on a copy (the caller's dict is not mutated)
    :return: weight dict (flag=0) or np.ndarray of per-sample weights (flag=1)
    :raises TypeError: if ``labels`` cannot be converted to an array
    """
    try:
        labels = np.array(labels)
    except TypeError:
        # Fail loudly: the original code printed and continued with the
        # unconverted input, which then crashed with an AttributeError.
        raise TypeError("输入格式错误")
    assert labels.ndim == 1
    labels_set = set(labels)
    labels_value = list(labels_set)
    labels_count = np.array([(labels == l).sum() for l in labels_value])
    # Inverse-frequency weighting: 1/count normalized is mathematically
    # identical to the prod()/count formulation but cannot overflow the
    # default integer dtype when class counts are large.
    sample_ratio = 1.0 / labels_count
    sample_ratio_std = sample_ratio / sample_ratio.sum()
    sample_weights_dict = {i: round(j, 4) for i, j in zip(labels_value, sample_ratio_std)}
    if not class_weights:
        if flag == 0:
            return sample_weights_dict
        # Map each sample's label to its (rounded) class weight.
        return np.array([sample_weights_dict[l] for l in labels])
    assert isinstance(class_weights, dict)  # must be a dict
    assert flag == 1  # custom weights are only returned as a sequence
    assert labels_set == set(class_weights.keys())
    total = sum(class_weights.values())
    # Normalize into a new dict so the caller's dict is never mutated.
    normalized = {key: value / total for key, value in class_weights.items()}
    return np.array([normalized[l] for l in labels])