'''标签编码(LabelEncoder)'''
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Build the encoder and fit it on four integer labels in one step
# (fit() returns the encoder itself, so chaining keeps `le` bound to it).
le = LabelEncoder().fit([1, 5, 67, 100])

# Show the classes the encoder learned from the fitted labels.
print(le.classes_)

# Encode a sequence that repeats some of the fitted labels.  The result is
# neither stored nor printed, so it is only visible in an interactive session.
le.transform([1, 1, 100, 67, 5])
''' 独热编码(OneHotEncoder)'''
from sklearn import preprocessing
# Build the encoder and fit it on four samples of three categorical features
# each (fit() returns the encoder itself, so `ohe` is the fitted encoder).
ohe = preprocessing.OneHotEncoder().fit(
    [
        [0, 0, 3],
        [1, 1, 0],
        [0, 2, 1],
        [1, 0, 2],
    ]
)

# Encode a single sample and convert the output to a dense array.  The result
# is neither stored nor printed, so it is only visible in an interactive
# session.
ohe.transform([[0, 1, 3]]).toarray()
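'''WOE encoding: a supervised encoding that uses, as the encoded value of each bin, a measure of how concentrated the target classes are in that bin.
Advantages: it brings feature values onto a similar scale and carries business meaning (empirically, the absolute value of WOE ranges roughly from 0.1 to 3).
Limitation: every bin must contain both good and bad samples.'''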
def CalcWOE(df, col, target):
    """Compute the WOE of every bin of ``col`` and the total IV.

    For each bin, ``good_pcnt`` and ``bad_pcnt`` are the bin's share of all
    good and all bad samples.  The bin's WOE is ``ln(good_pcnt / bad_pcnt)``
    and the IV is the sum over bins of ``(good_pcnt - bad_pcnt) * WOE``.

    :param df: dataframe containing the feature and the target
    :param col: name of the feature column; it must already be binned,
        because every distinct value is treated as one bin
    :param target: name of the good/bad indicator column; its per-bin sum is
        used as the number of bad samples (so bad is presumably coded as 1
        and good as 0 -- confirm against the caller)
    :return: ``{"WOE": {bin: {"WOE": value}}, "IV": total_iv}``; note that
        each bin maps to a one-entry dict, not directly to a number
    :raises ValueError: if any bin has no bad samples or no good samples,
        since its WOE would be infinite or undefined
    """
    per_bin = df.groupby([col])[target]
    total = per_bin.count()
    bad = per_bin.sum()
    good = total - bad

    # WOE needs both classes in every bin; fail loudly instead of letting a
    # division by zero or log(0) leak inf/nan into the result.
    degenerate = (bad == 0) | (good == 0)
    if degenerate.any():
        offending = degenerate[degenerate].index.tolist()
        raise ValueError(
            "Cannot compute WOE for column %r: these bins do not contain "
            "both good and bad samples: %r" % (col, offending)
        )

    bad_total = bad.sum()
    good_total = total.sum() - bad_total

    bad_pcnt = bad / bad_total
    good_pcnt = good / good_total
    woe = np.log(good_pcnt / bad_pcnt)

    # skipna=False keeps a NaN (e.g. from an invalid target) visible in the
    # total instead of silently dropping that bin's contribution.
    iv = ((good_pcnt - bad_pcnt) * woe).sum(skipna=False)

    woe_dict = woe.to_frame('WOE').to_dict(orient='index')
    return {"WOE": woe_dict, 'IV': float(iv)}