评价函数
此函数用于评价外部数据,主要指标有:PSI、相关系数、互信息、KS、GINI、IV、AUC、f1_score,此外还有其他描述性指标。
# PSI score calculation
def psi_cal(score_col, label_col, date_col, data, split_date=20210800):
    """Compute a PSI-style stability value for a score across a date split.

    Rows with ``data[date_col] < split_date`` form the base sample; all
    remaining rows form the comparison sample. The base scores are cut into
    up to ten quantile bins, the same bin edges are applied to the comparison
    scores, and the mean of ``label_col`` is taken per bin in both samples.
    The returned value is ``sum((base - pre) * ln(base / pre))`` over bins.

    Note that the quantities compared are the per-bin label means (bad
    rates), not the per-bin population shares.

    Args:
        score_col: Name of the score column in ``data``.
        label_col: Name of the label column in ``data``.
        date_col: Name of the date column in ``data``; it is compared
            numerically against ``split_date``.
        data: DataFrame holding the three columns above.
        split_date: Threshold separating base rows (``<``) from comparison
            rows (``>=``). Defaults to ``20210800``.

    Returns:
        The summed PSI terms. The result is ``inf`` or ``nan`` when a bin has
        a zero bad rate in either sample or is empty in the comparison
        sample, because the ratio/logarithm is then undefined.
    """
    base_data = data[data[date_col] < split_date]
    pre_data = data[data[date_col] >= split_date]

    # Quantile-bin the base scores and keep the edges for the comparison set.
    base_bins, edges = pd.qcut(
        base_data[score_col], 10, duplicates='drop', retbins=True
    )
    pre_bins = pd.cut(pre_data[score_col], edges, include_lowest=True)

    # Use the caller-supplied label column rather than a hard-coded ``flag``.
    # observed=False keeps every bin in both results so the two rate vectors
    # have the same length and line up position by position.
    base_bad_rate = base_data[label_col].groupby(base_bins, observed=False).mean()
    pre_bad_rate = pre_data[label_col].groupby(pre_bins, observed=False).mean()

    base_values = base_bad_rate.values
    pre_values = pre_bad_rate.values
    psi_terms = (base_values - pre_values) * np.log(base_values / pre_values)
    return psi_terms.sum()
# Evaluation function: descriptive and predictive-power metrics for one feature
def socre_test(test, label):
    """Evaluate a single feature against a binary label.

    ``mic``, ``roc_auc_score`` and ``f1_score`` are expected to be in scope
    from the enclosing module.

    Args:
        test: pd.Series with the feature values to evaluate; missing values
            count as "not found" and are dropped before any metric is
            computed.
        label: pd.Series with the target values, indexed like ``test``.

    Returns:
        dict with the hit rate (percentage of non-missing values), the
        correlation with the label, the first element returned by ``mic``,
        KS, IV, a bin-weighted Gini impurity, AUC, F1 score, and the range,
        maximum, minimum, mean and coefficient of variation of the feature.
    """
    # Hit rate: percentage of samples for which a value is present.
    found = test.notna()
    rate_response = (found.sum() / len(test)) * 100

    # Keep only the samples that have a value, and the matching labels.
    test = test[found]
    label = label[test.index]

    # Descriptive statistics.
    min_ = test.min()
    max_ = test.max()
    range_ = max_ - min_
    mean_ = test.mean()
    std_ = test.std()
    cov = std_ / mean_  # coefficient of variation

    # Correlation between the feature and the label.
    corr = test.corr(label)

    # Mutual-information style score; only the first returned element is used.
    mic_value = mic(test.values.reshape(-1, 1), label.values.reshape(-1, 1))[0]

    # Quantile binning (up to 10 bins) with per-bin size and bad rate.
    quantile_bins = pd.qcut(test, 10, duplicates='drop')
    per_bin = label.groupby(quantile_bins).agg(['count', 'mean'])
    all_num = per_bin['count']
    bad_rate = per_bin['mean']
    good_rate = 1 - bad_rate
    bad_num = all_num * bad_rate
    good_num = all_num * good_rate

    # IV: a bin with no bad or no good samples makes the log term infinite.
    bad_share = bad_num / bad_num.sum()
    good_share = good_num / good_num.sum()
    woe = np.log(bad_share / good_share)
    iv_value = ((bad_share - good_share) * woe).sum()

    # KS: largest gap between cumulative bad and good shares over the bins.
    total_bad = label.sum()
    cum_bad = (bad_num / total_bad).cumsum()
    cum_good = (good_num / (len(label) - label.sum())).cumsum()
    ks_value = abs(cum_bad - cum_good).max()

    # Gini impurity per bin, weighted by the bin's share of the samples.
    gini_per_bin = 1 - ((bad_rate * bad_rate) + (good_rate * good_rate))
    gini_value = sum(gini_per_bin * all_num / len(label))

    # AUC on the min-max scaled feature, flipped when it is negatively
    # correlated with the label.
    score = (test - min_) / (max_ - min_)
    if corr < 0:
        score = 1 - score
    ordered_score = score.sort_values()
    auc_score = roc_auc_score(label[ordered_score.index], ordered_score)

    # F1: flag the top decile when correlation is positive, otherwise the
    # bottom decile.
    if corr > 0:
        point = test.quantile(0.9)
        y_pred = (test >= point).astype(int)
    else:
        point = test.quantile(0.1)
        y_pred = (test <= point).astype(int)
    f1score = f1_score(label, y_pred)

    return {
        '查得率': rate_response,
        'corr': corr,
        'mic': mic_value,
        'ks': ks_value,
        "iv": iv_value,
        "gini": gini_value,
        "auc": auc_score,
        'f1_socre': f1score,
        '极差': range_,
        '最大值': max_,
        "最小值": min_,
        "均值": mean_,
        "变异系数": cov,
    }
}