为便于工作中计算KS和IV等值,编写一套可以沿用的代码,方便随时使用。
#第一步,读取数据
import pandas as pd
import numpy as np
df=pd.read_excel(r'C:\Users\***\a.xlsx')
#第二步:查看所有字段的字段类型
df.dtypes
#新建一个DataFrame,记载各变量的数据类型,便于后续筛选
df_var=pd.DataFrame(df.columns,columns=['var'])
df_type=pd.DataFrame(df.dtypes,columns=['type'])
#合并
df_var_type=pd.merge(df_var,df_type,how='left',left_on='var',right_index=True)
#字符型变量
list_object=list(df_var_type[df_var_type['type']=='object']['var'])
#数值型变量
list_num=list(df_var_type[(df_var_type['type']=='int64')|(df_var_type['type']=='float64')]['var'])
#第三步:分箱函数:两个:cut和qcut,即等宽和等频
#cut分箱:等宽
def cut_func(df_fx,n):
data=df_fx
cut=pd.DataFrame()
for i in data.columns:
if i in list_num:
var_num=len(list(data[i].unique()))
print(i,var_num)
if var_num>n:
cut[i]=pd.cut(data[i], n)
else:
cut[i]=data[i]
if i in list_object:
cut[i]=data[i]
return cut
#qcut分箱:等频
def qcut_func(df_fx,n):
data=df_fx
cut=pd.DataFrame()
for i in data.columns:
if i in list_num:
var_num=len(list(data[i].unique()))
print(i,var_num)
if var_num>n:
cut[i]=pd.qcut(data[i], n,duplicates='drop')
else:
cut[i]=data[i]
if i in list_object:
cut[i]=data[i]
return cut
#第四步:对数据进行分箱,可以根据需求进行筛选使用哪种分箱方法
df_cut=qcut_func(df,10) #等频
df_cut=cut_func(df,10) #等宽
#添加或定义Y列
df_cut['Y']=df['15A']
#第五步:分组求值
def ks_iv(df):
data=df_cut.copy()
data3=pd.DataFrame()
#data=data.fillna('null')
for i in data.columns:
total=data.groupby(i)[i].count()
total=pd.DataFrame({'total':total})
good=data[data['Y']==0].groupby(i)[i].count()
good=pd.DataFrame({'good':good})
data1=pd.merge(total,good,how='left',left_index=True,right_index=True)
data1=data1.fillna(0)
data1['var']=i
data1.reset_index(inplace=True)
data1['cut']=data1[i]
data1['bad']=data1['total']-data1['good']
data1.reset_index(inplace=True)
data1['group_rate']=data1['total']/len(data)
data1['bad_rate']=data1['bad']/data1['total']
data1['woe']=np.log((data1['bad']/data1['bad'].sum())/(data1['good']/data1['good'].sum()))
data1['iv']=data1['woe']*(data1['bad']/data1['bad'].sum()-data1['good']/data1['good'].sum())
data1.replace([-np.Inf,np.Inf],0,inplace=True)
data1['total_good']=data1['good'].cumsum()
data1['total_bad']=data1['bad'].cumsum()
data1['total_good%']=data1['total_good']/data1['good'].sum()
data1['total_bad%']=data1['total_bad']/data1['bad'].sum()
data1['ks']=abs(data1['total_bad%']-data1['total_good%'])
data_z=pd.DataFrame([[i,'总计',data1['total'].sum(),data1['bad'].sum(),data1['good'].sum(),data1['group_rate'].sum()
,data1['bad'].sum()/data1['total'].sum(),0,data1['iv'].sum()
,max(data1['total_good%']),max(data1['total_bad%']),max(data1['ks'])]]
,columns=['var','cut','total','bad','good','group_rate','bad_rate','woe','iv','total_good%','total_bad%','ks'])
data2=pd.concat([data1,data_z])
data3=pd.concat([data2,data3])
#data2.reset_index(inplace=True)
#data2=pd.DataFrame(data2,columns=['var','total','good','bad','group_rate','bad_rate','woe','iv','total_good%','total_bad%','ks'])
#data2=pd.DataFrame(data2,columns=['var','cut','total','good','bad','iv','total_good%','total_bad%','ks'])
data3=pd.DataFrame(data3,columns=['var','cut','total','good','bad','group_rate','bad_rate','woe','iv','total_good%','total_bad%','ks'])
#data2.set_index(["var"], inplace=True)
#data_new=pd.concat([data2,data_z])
# data_new=pd.DataFrame(data_new,columns=['var','cut','total','good','bad','group_rate','bad_rate','woe','iv','total_good%','total_bad%','ks'])
return data3
#最后一步,计算
ks_iv(df_cut)