思维导图:
说明:文档全部数据已脱敏
前期数据准备工作:
数据形态:
data.shape  # (rows, columns) of the raw dataset
发现只有两个缺失值,用众数----'bachelor'填充:
data.info()  # dtypes / non-null counts; 最高学历 has two missing values
data[data.最高学历.isnull()]  # inspect the rows with missing 最高学历
data.最高学历.value_counts()  # the mode is 'bachelor'
data['最高学历']=data.最高学历.fillna('bachelor')  # impute missing values with the mode
查看样本是否均衡:
data.performance.value_counts()[0]/data.shape[0]  # share of class 0 — sample-balance check
查看各变量的不同水平值个数,方便区分连续型变量和分类型变量(分箱的时候也需要):
# Count distinct levels of every feature to tell continuous from categorical
# variables (also needed later for binning). Fixed: the loop body had lost
# its indentation in the original.
data_copy = data.copy()
allFeatures = list(data.columns)
allFeatures.remove('履约')  # drop the target column
for i in allFeatures:
    print('变量 {} 的不同水平值有 {} 个'.format(i, len(data[i].unique())))
查看异常值:
箱线图:
上图发现数据存在异常值,需要进行异常值处理:
# Percentile overview (1% ... 99%) to spot outlier tails
X.describe([0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99]).T
# Pairplot of Cook's distance, standardized residual and leverage
s = out.iloc[:,[3,4,5]]  # NOTE(review): `out` comes from an influence/OLS step not shown here — confirm column order
sns.pairplot(s)
plt.show()
# Extract the data with outliers removed, filtering successively on
# leverage, standardized residuals and Cook's distance.
# Fixed: removed the pointless `for i in range(1):` wrapper, restored the
# lost indentation, and dropped the dead bare `z_i` expression.
y_i = out.loc[leverage < 2 * np.mean(leverage), :]   # leverage < 2 * mean leverage (leverage assigned earlier)
x_i = y_i.loc[np.abs(y_i.standard_resid) < 2, :]     # |standardized residual| < 2
# Cook's distance < 4 / (n - k - 1); n and k taken from s's shape
z_i = x_i.loc[x_i.cooks_d < 4 / (np.shape(s)[0] - np.shape(s)[1] - 1), :]
print(z_i.shape)
将标签列0,1互换,方便后面计算:
data.履约.value_counts()  # label distribution before swapping the 0/1 encoding
将分类数据编码:
# Ordinal-encode every object/bool column in place.
# Fixed: the loop/if bodies had lost their indentation in the original.
for i in data:
    if data[i].dtypes == 'object' or data[i].dtypes == 'bool':
        data[i] = OrdinalEncoder().fit_transform(data[[i]])
最后数据形态:
data.shape  # shape after imputation and encoding
尝试建模看准确度:
'''建模'''
# Quick logistic-regression baseline on the encoded data:
# accuracy and AUC on a 30% hold-out split.
from sklearn.model_selection import train_test_split as sp
from sklearn.linear_model import LogisticRegression as LR
from sklearn import metrics

X = data.iloc[:, :-1]   # all feature columns
y = data.iloc[:, -1]    # target is the last column
X_train, X_test, y_train, y_test = sp(X, y, test_size=0.3, random_state=1)
lr = LR(random_state=1)
lr.fit(X_train, y_train)
y_test_label = lr.predict(X_test)               # hard labels for accuracy
y_test_value = lr.predict_proba(X_test)[:, 1]   # positive-class probability for AUC
print("测试集准确率是:{:.2%}".format(metrics.accuracy_score(y_test,y_test_label)))
print("测试集AUC是: {:.4}".format(metrics.roc_auc_score(y_test, y_test_value)))
特征列重要性排序:
# Rank features by LightGBM importance and keep the top 17.
allFeatures = list(data.columns)
allFeatures.remove('履约')
X = data[allFeatures]
y = data['履约']
from sklearn.model_selection import train_test_split as sp
X_train, X_test, y_train, y_test = sp(X, y, test_size=0.3, random_state=1)
import lightgbm as LGB
params = {'objective': 'binary',
          "boosting": "gbdt",
          'num_leaves': 4,
          'min_data_in_leaf': 20,
          "subsample": 0.9,
          "colsample_bytree": 0.8,
          'learning_rate': 0.09,
          'tree_learner': 'voting',
          'metric': 'auc'}
dtrain = LGB.Dataset(X_train, y_train)
dtest = LGB.Dataset(X_test, y_test, reference=dtrain)
# NOTE(review): early_stopping_rounds/verbose_eval are keyword args of older
# LightGBM; newer versions use callbacks — confirm the installed version.
lgb = LGB.train(params, dtrain, valid_sets=[dtrain, dtest],
                num_boost_round=3000, early_stopping_rounds=100, verbose_eval=10)
'''lightgbm重要性选择变量'''
importace = list(lgb.feature_importance())
allFeatures = list(X.columns)
featureImportanceSorted = sorted(zip(allFeatures, importace), key=lambda k: k[1], reverse=True)
plt.figure(figsize=(5, 10))  # fixed: this call was fused with the barplot on one line (SyntaxError)
sns.barplot(x=[k[1] for k in featureImportanceSorted], y=[k[0] for k in featureImportanceSorted])
plt.xticks(rotation='vertical')
plt.show()
feature_selection_lgb = [k[0] for k in featureImportanceSorted[:17]]  # fixed: [:16] kept 16, comment/prose say top 17
根据上图,选取前17个变量。
多重共线性处理:
# Correlation heatmap of the candidate feature columns (multicollinearity check).
data888= data[['性别', '最高学历', '大学类型', '借款用途', '担保', '项目金额', '收益', '周期_月', '月还本息','总还款本息', '借款描述字数', '累积借款金额', '累积还款金额', '累积逾期金额', '累积已还次数', '累积逾期次数','累积逾借金额比', '累积逾期次数比']]
corr1=data888.corr()
plt.figure(figsize = (15, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Fixed: `mask` was referenced but never defined (NameError). Hide the upper
# triangle, the conventional use of seaborn's heatmap mask.
mask = np.triu(np.ones_like(corr1, dtype=bool))
sns.heatmap(corr1, mask= mask, cmap=cmap, center=0, annot =True, cbar_kws={"shrink": .5})
plt.show()
# PCA on the highly correlated money/overdue columns (multicollinearity fix):
# compress 10 columns into 3 components z1..z3.
x = data[['累积借款金额','累积还款金额','累积逾期金额','累积已还次数','累积逾期次数','累积逾借金额比','累积逾期次数比', '总还款本息','项目金额','收益']].copy()  # .copy() avoids SettingWithCopy on assignment below
from sklearn.decomposition import PCA
pca = PCA(n_components=3).fit(x)
pca.explained_variance_
pca.explained_variance_ratio_
# Fixed: the original rebound `pca` to the transformed array (shadowing the
# fitted model) and refit with fit_transform; reuse the fitted model instead.
components = pca.transform(x)
# Fixed: align on x's index — a plain pd.DataFrame(components) assumes a
# 0..n-1 RangeIndex and silently misaligns (NaNs) otherwise.
x[['z1','z2','z3']] = pd.DataFrame(components, index=x.index)
x[['z1','z2','z3']].head()
data1 = data[['性别', '最高学历', '大学类型', '借款用途', '担保', '周期_月', '月还本息', '借款描述字数', '履约']].copy()
data1[['z1','z2','z3']] = x[['z1','z2','z3']]
data1.columns
决策树分箱:
# Decision-tree binning: fit a tree on the continuous columns and use its
# split points as bin edges.
from sklearn import tree
from sklearn.model_selection import train_test_split
X = data.iloc[:,[5,6,8,9,10]]  # NOTE(review): positional picks — presumably 月还本息/借款描述字数/z1/z2/z3; confirm against data.columns
Y = data.iloc[:,-4]  # NOTE(review): target taken 4th from the end — verify this is 履约 after the z columns were appended
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3,random_state=1)
clf = tree.DecisionTreeClassifier()
网格搜索
# Grid-search the decision-tree hyper-parameters, then refit with the best ones.
from sklearn.model_selection import GridSearchCV
# Candidate values for each hyper-parameter
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': range(2, 10),
    'min_samples_leaf': range(1, 10),
    'min_samples_split': range(2, 30, 2),
}
reg = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5)
# Fit the search on the training split
reg.fit(Xtrain, Ytrain)
reg.best_params_  # displayed in the notebook
# Refit a single tree with the chosen parameters and score it on the hold-out
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                  min_samples_leaf=6, min_samples_split=26)
clf.fit(Xtrain, Ytrain)
clf.score(Xtest, Ytest)
# Render the fitted decision tree with graphviz to read off the split points.
import graphviz
feature_name = ['月还款本息','借款描述字数','Z1','Z2','Z3']
dot_data = tree.export_graphviz(clf #the trained model
,out_file = None #return the dot source instead of writing a file
,filled=True #colour-fill the nodes
,feature_names= feature_name
,rounded=True #rounded node corners
)
graph = graphviz.Source(dot_data)
graph
分界点图形验证和图形分组:
借款用途:
# Class-conditional density of 借款用途, then bin it at the tree split points
sns.distplot(data1['借款用途'][data1['履约'] == 0].dropna(),color='blue')
sns.distplot(data1['借款用途'][data1['履约'] == 1].dropna(),color='red')
plt.show()
bins=[-1,4,10]
cats=pd.cut(list(data1['借款用途']), bins, precision=0) #explicit bin edges
cats.value_counts()
借款描述字数:
# Class-conditional density of 借款描述字数, then bin at the tree split points
sns.distplot(data1['借款描述字数'][data1['履约'] == 0].dropna(),color='blue')
sns.distplot(data1['借款描述字数'][data1['履约'] == 1].dropna(),color='red')
plt.show()
# NOTE(review): negative edges suggest this column was standardized upstream — confirm
bins=[-2,-0.604,-0.298,-0.114,1.946,10]
cats=pd.cut(list(data1['借款描述字数']), bins, precision=0) #explicit bin edges
cats.value_counts()
# z1: class densities and binning
sns.distplot(data1['z1'][data1['履约'] == 0].dropna(),color='blue')
sns.distplot(data1['z1'][data1['履约'] == 1].dropna(),color='red')
plt.show()
bins=[-5,-0.945,-0.344,0.117,20]
cats=pd.cut(list(data1['z1']), bins, precision=0) #explicit bin edges
cats.value_counts()
# z2: class densities and binning
sns.distplot(data1['z2'][data1['履约'] == 0].dropna(),color='blue')
sns.distplot(data1['z2'][data1['履约'] == 1].dropna(),color='red')
plt.show()
bins=[-5,0.152,20]
cats=pd.cut(list(data1['z2']), bins, precision=0) #explicit bin edges
cats.value_counts()
# z3: class densities and binning
sns.distplot(data1['z3'][data1['履约'] == 0].dropna(),color='blue')
sns.distplot(data1['z3'][data1['履约'] == 1].dropna(),color='red')
plt.show()
bins=[-5,0.1421,0.48,20]
cats=pd.cut(list(data1['z3']), bins, precision=0) #explicit bin edges
cats.value_counts()
分组完成后的数据:
data1.head()  # data after binning
WOE值计算:
# Compute WOE and IV for one column.
def CalcWOE(df, col, target):
    """Compute Weight-of-Evidence per level of `col` and the column's IV.

    Parameters
    ----------
    df : pd.DataFrame  -- data containing `col` and the binary `target`
    col : str          -- grouping column (a binned/categorical feature)
    target : str       -- binary target; rows with target == 1 count as "bad"

    Returns
    -------
    dict with keys "WOE" (level -> WOE float) and "IV" (float).

    Fixed: the original body had lost all indentation; the dict-flattening
    loop over WOE_dict is replaced by a direct Series.to_dict().
    """
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()   # count of target == 1 per level
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    regroup['good'] = regroup['total'] - regroup['bad']
    G = N - B
    regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x * 1.0 / B)
    regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
    # NOTE(review): a level with zero good or zero bad yields +/-inf WOE;
    # consider add-0.5 smoothing if such levels can occur.
    regroup['WOE'] = regroup.apply(lambda x: np.log(x.good_pcnt * 1.0 / x.bad_pcnt), axis=1)
    WOE_dict = regroup.set_index(col)['WOE'].to_dict()
    IV = regroup.apply(lambda x: (x.good_pcnt - x.bad_pcnt) * np.log(x.good_pcnt * 1.0 / x.bad_pcnt), axis=1)
    return {"WOE": WOE_dict, 'IV': sum(IV)}
# Compute WOE mappings and IV for every candidate column.
# Fixed: the loop body had lost its indentation in the original.
data3 = data1[['借款用途_组别', '月还本息_组别', '借款描述字数_组别', 'z1_组别', 'z2_组别', 'z3_组别','性别', '最高学历', '大学类型', '担保']]
woe_1 = {}  # column -> {level: WOE}
iv_1 = {}   # column -> IV
for i in data3:
    dar = CalcWOE(data1, i, '履约')
    woe_1[i] = dar['WOE']
    iv_1[i] = dar['IV']
IV值:IV值大于等于0.02的特征列保留
iv_1  # IV per variable; variables with IV >= 0.02 are kept
# Logistic-regression modelling: keep only variables with IV >= 0.02.
'''选取IV>=0.02的变量'''
IV_sorted = sorted(iv_1.items(), key=lambda pair: pair[1], reverse=True)
IV_name = [pair[0] for pair in IV_sorted]
IV_values = [pair[1] for pair in IV_sorted]
high_IV = {k: v for k, v in iv_1.items() if v >= 0.02}
high_IV_sorted = sorted(high_IV.items(), key=lambda pair: pair[1], reverse=True)
print('总共', len(high_IV_sorted), '个变量IV >= 0.02')
WOE值:各组类别的WOE值
woe_1  # WOE value of every level, per variable
创建WOE变量
def f(x):
    """Map a raw level value of the current `var` column to its WOE.

    Reads the loop variable `var` from module scope (set by the loop below),
    so it must only be called while that loop is running. Returns None for
    unseen levels — same as the original linear scan.
    """
    # Fixed: the original linearly scanned woe_1[var].items() comparing with
    # ==, re-implementing an O(n) dict lookup; .get() is the direct form.
    return woe_1[var].get(x)
'''创建WOE变量'''
# Replace each selected column's levels with their WOE values.
# Fixed: the loop body had lost its indentation in the original.
short_list = high_IV.keys()
short_list_2 = []
for var in short_list:
    newVar = var + '_WOE'
    data1[newVar] = data1[var].map(f)
    short_list_2.append(newVar)
data1.columns  # confirm the new *_WOE columns were added
data1[['借款用途_组别_WOE', '月还本息_组别_WOE', '借款描述字数_组别_WOE', 'z1_组别_WOE', 'z2_组别_WOE','z3_组别_WOE', '性别_WOE', '最高学历_WOE', '大学类型_WOE', '担保_WOE']].head()
WOE值柱状图:
list(woe_1['z1_组别'].values())  # WOE values of the z1 bins
# Bar plot of WOE per z1 bin.
# Fixed: `.key()` is not a dict method (AttributeError) — must be `.keys()`.
pd.DataFrame(zip(list(woe_1['z1_组别'].keys()), list(woe_1['z1_组别'].values()))).plot(kind='bar')
逻辑回归建模:
# NOTE(review): `daaa` is not defined in this view — presumably a WOE-encoded
# copy of data1 built in a cell not shown; confirm upstream.
# Override: rows where 最高学历 == 4 get -100 in the third-from-last column
# (special-case treatment — intent unclear, verify).
daaa.iloc[data1[data1['最高学历']==4].index.tolist(),-3]=-100
sadfadfs=daaa[['月还本息_组别_WOE', '借款描述字数_组别_WOE', 'z1_组别_WOE', 'z2_组别_WOE', 'z3_组别_WOE','性别_WOE', '最高学历_WOE', '大学类型_WOE', '担保_WOE']]
'''建模'''
X=sadfadfs
y=daaa[['履约']]
from sklearn.model_selection import train_test_split as sp
X_train, X_test, y_train, y_test = sp(X, y, test_size=0.3, random_state=1)
from sklearn.linear_model import LogisticRegression as LR
lr=LR(random_state=1)  # logistic regression on the WOE-encoded features
lr.fit(X_train, y_train)
from sklearn import metrics
y_test_label = lr.predict(X_test)  # hard labels for accuracy
y_test_value = lr.predict_proba(X_test)[:, 1]  # positive-class probability for AUC
print("测试集准确率是:{:.2%}".format(metrics.accuracy_score(y_test,y_test_label)))
print("测试集AUC是: {:.4}".format(metrics.roc_auc_score(y_test, y_test_value)))
logistic模型客群变化的敏感度不如其他高复杂度模型,因此稳健更好,鲁棒性更强。
模型直观。系数含义好阐述、易理解。对金融领域高管以及银行出身的建模专家,变量系数可以跟他们的业内知识做交叉验证,更容易让人信服。
KS曲线:
ROC曲线:
算分数:
# Scorecard scaling: score = A - B * log(odds); per-bin score contribution
# is -(B * coef * WOE) plus an equal share of the base score.
b = lr.intercept_  # intercept
coe = lr.coef_     # coefficient matrix, shape (1, n_features)
a0 = coe[0][0]  # 借款用途_组别_WOE coefficient
a1 = coe[0][1]  # 月还本息_组别_WOE coefficient
a2 = coe[0][2]  # 借款描述字数_组别_WOE coefficient
a3 = coe[0][3]  # z1_组别_WOE coefficient
a4 = coe[0][4]  # z2_组别_WOE coefficient
a5 = coe[0][5]  # z3_组别_WOE coefficient
a6 = coe[0][6]  # 性别_WOE coefficient
a7 = coe[0][7]  # 最高学历_WOE coefficient
a8 = coe[0][8]  # 大学类型_WOE coefficient
a9 = coe[0][9]  # 担保_WOE coefficient
# NOTE(review): the final model was fit on 9 WOE columns (without 借款用途),
# so these 10 comments may be off by one and coe[0][9] may not exist — verify.
A = 500   # base score
PDO = 20  # points to double the odds
B = PDO / np.log(2)
# NOTE(review): this pairs a0 with the z1_组别 WOE values — looks like a
# copy-paste slip (a3 would match z1); confirm before trusting these scores.
# Fixed: restored the loop indentation and hoisted the per-iteration list
# rebuild into a single enumerate over the dict values.
for j, woe1 in enumerate(woe_1['z1_组别'].values()):
    print('第', j, '区间得分:', -(B * a0 * woe1) + (A - B * b) / dataWOE.shape[1])
woe_1['z1_组别']  # show the mapping: keys are the bin labels (variable levels)
# Same per-bin score computation as the previous cell (duplicate in the
# original notebook). Fixed: restored indentation, enumerate over values.
for j, woe1 in enumerate(woe_1['z1_组别'].values()):
    print('第', j, '区间得分:', -(B * a0 * woe1) + (A - B * b) / dataWOE.shape[1])
# Total-score density by class (daret/总分 are built in cells not shown here)
sns.kdeplot(daret.loc[(daret.履约==1),'总分'],shade=True,color='b')
sns.kdeplot(daret.loc[(daret.履约==0),'总分'],shade=True,color='r')
# For each (bin column, score column) pair, print every bin level and the
# distinct score(s) assigned to it.
x=[('借款用途_组别','借款用途_组别score'),('月还本息_组别','月还本息_组别score'),('借款描述字数_组别','借款描述字数_组别score'),('z1_组别','z1_组别score'),('z2_组别','z2_组别score'),('z3_组别','z3_组别score')]
for grp_col, score_col in x:
    print(grp_col)
    for level in gg[grp_col].unique().tolist():
        # Fixed: original had `gg[x[j][0]]=a` (assignment) inside .loc — a
        # SyntaxError; the comparison must be `==`.
        print(level, list(gg.loc[gg[grp_col] == level, score_col].unique()))