import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
data=pd.read_excel(r'E:\lll\20200311人工客群分布\sx_all_data_0311.xlsx')#导入数据
variable=pd.read_excel(r'E:\lll\20200311人工客群分布\变量类型.xlsx')#变量列表
continue_variable=list(variable['连续变量'])#获取连续变量列表
'''
连续型变量iv计算
'''
def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
'''
计算变量各个分箱的WOE、IV值,返回一个DataFrame
'''
x = x.fillna(nan)
boundary = optimal_binning_boundary(x, y, nan) # 获得最优分箱边界值列表
df = pd.concat([x, y], axis=1) # 合并x、y为一个DataFrame,方便后续计算
df.columns = ['x', 'y'] # 特征变量、目标变量字段的重命名
df['bins'] = pd.cut(x=x, bins=boundary, right=False) # 获得每个x值所在的分箱区间
grouped = df.groupby('bins')['y'] # 统计各分箱区间的好、坏、总客户数量
result_df = grouped.agg([('good', lambda y: (y == 0).sum()),
('bad', lambda y: (y == 1).sum()),
('total', 'count')])
result_df['good_pct'] = result_df['good'] / result_df['good'].sum() #好客户占比
result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()# 坏客户占比
result_df['total_pct'] = result_df['total'] / result_df['total'].sum()#总客户占
result_df['IV'] =result_df['iv'].sum()
result_df['变量'] =i
result_df=result_df.reset_index()
return result_df
a=[]
for i in list(continue_variable['连续变量']):
a.append(feature_woe_iv(x=data[i],y=data['dpd30']))
'''
离散型变量iv计算
'''
de_variable=list(continue_variable['离散变量'].unique())
def de_variable(data):
for i in de_variable:
povit1=pd.pivot_table(data,index=[i],values=['contract_no','dpd30'],
aggfunc={r'contract_no':'count',r'dpd30':np.sum})
povit1=povit1.rename(columns={'contract_no':'total','dpd30':'bad'})
povit1['good'] = povit1['total'] - povit1['bad'].sum()# 好客户占比
povit1['good_pct'] = povit1['good'] / povit1['good'].sum() # 好客户占比
povit1['bad_pct'] = povit1['bad'] / povit1['bad'].sum() # 坏客户占比
povit1['total_pct'] = povit1['total'] / povit1['total'].sum()# 总客户占比
povit1['bad_rate'] = povit1['bad'] / povit1['total'] # 坏比率
povit1['woe'] = np.log(povit1['good_pct'] / povit1['bad_pct'])# WOE
povit1['iv'] = (povit1['good_pct'] - povit1['bad_pct']) * povit1['woe'] # IV
povit1['IV'] =povit1['iv'].sum()
povit1['变量'] =i
povit1=povit1.reset_index()
povit1=povit1.rename(columns={i:'bins'})
a.append(povit1)
iv_result=pd.concat(a,axis=0)#合并所有变量iv
iv_result.to_excel(r'E:\lusiru\20200311人工客群分布\iv_result.xlsx')
Python 批量计算变量iv值
最新推荐文章于 2021-04-08 10:35:48 发布