机器学习实例——信用评级全流程实现


上篇博文中,我们对汽车违约数据进行了预测与变量筛选:
《数据挖掘实例——信用评级》。
现在我们来做一个全流程的机器学习项目。
源数据集与源代码可以在GitHub中下载,然后在本地调试。
GitHub地址

读取并选择数据

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Load the two application files used throughout the script.
accepts = pd.read_csv('accepts.csv')
rejects = pd.read_csv('rejects.csv')

# Feature columns selected from both files.
knn_features = ["tot_derog", "age_oldest_tr", "rev_util", "fico_score", "ltv"]

# Take explicit copies: these frames are later filled in place, and writing
# into a selection of `accepts` / `rejects` triggers pandas' chained-assignment
# warnings (and is not guaranteed to behave the same across pandas versions).
accepts_x = accepts[knn_features].copy()
accepts_y = accepts['bad_ind']
rejects_x = rejects[knn_features].copy()

在这里插入图片描述

定义缺失值替换函数并填补缺失值

def Myfillna_median(df):
    """Replace missing values in every column of *df* with that column's median.

    The frame is updated in place and the same object is returned, so callers
    may use either the argument or the return value.

    Args:
        df: DataFrame whose columns all support ``Series.median()``.

    Returns:
        The same DataFrame object, with missing values filled per column.
    """
    for column in df.columns:
        median = df[column].median()
        # Assign the filled column back to the frame.  Calling
        # ``df[column].fillna(..., inplace=True)`` operates on a temporary
        # Series and does not reliably update ``df`` (it is a no-op under
        # pandas copy-on-write).
        df[column] = df[column].fillna(median)
    return df
# Median-fill both feature frames; the helper returns the frame it was given.
accepts_x_filled = Myfillna_median(accepts_x)
rejects_x_filled = Myfillna_median(rejects_x)

归一化数据(Normalizer:按样本逐行缩放到单位范数)

from sklearn.preprocessing import Normalizer

# NOTE(review): Normalizer rescales each row (sample) to unit norm; it does not
# standardise columns.  Confirm this is the intended preprocessing for the
# distance-based model that consumes these frames.
accepts_x_norm = pd.DataFrame(
    Normalizer().fit_transform(accepts_x_filled),
    columns=accepts_x_filled.columns,
)

rejects_x_norm = pd.DataFrame(
    Normalizer().fit_transform(rejects_x_filled),
    columns=rejects_x_filled.columns,
)

利用knn模型进行预测,做拒绝推断

from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

# Train a distance-weighted 5-NN classifier on the accepted applications
# (fit() returns the estimator itself), then use it to assign a predicted
# bad_ind to every rejected application.
neigh = KNeighborsClassifier(weights='distance', n_neighbors=5).fit(
    accepts_x_norm, accepts_y
)
rejects['bad_ind'] = neigh.predict(rejects_x_norm)

将审核通过的申请者和未通过的申请者进行合并

# Keep 1340 of the rejects predicted as good (bad_ind == 0) plus every reject
# predicted as bad (bad_ind == 1).  A fixed seed makes the draw reproducible,
# matching the random_state=0 used by the models further down.
rejects_good = rejects[rejects['bad_ind'] == 0].sample(n=1340, random_state=0)
rejects_bad = rejects[rejects['bad_ind'] == 1]
rejects_res = pd.concat([rejects_good, rejects_bad], axis=0)

# NOTE(review): the positional slices assume that dropping the first two and
# the last column of `accepts`, and the first column of `rejects`, leaves the
# same set of columns in both frames -- confirm against the CSV layouts.
data = pd.concat([accepts.iloc[:, 2:-1], rejects_res.iloc[:, 1:]], axis=0)

分类变量转换

# Encode the 'N' / 'Y' bankruptcy flag as 0 / 1.
bankruptcy_dict = dict(N=0, Y=1)
data['bankruptcy_ind'] = data['bankruptcy_ind'].map(bankruptcy_dict)

处理异常值

# Bounds used to clamp vehicle_year.
# NOTE(review): the lower bound is the 10% quantile while the upper bound is
# the 99% quantile -- confirm the asymmetry is intentional (0.01 may have been
# meant).
year_min = data.vehicle_year.quantile(0.1)
year_max = data.vehicle_year.quantile(0.99)


def _vehicle_age(year):
    """Clamp *year* to [year_min, year_max] and return 2018 minus the result."""
    if year >= year_max:
        return 2018 - year_max
    if year <= year_min:
        return 2018 - year_min
    return 2018 - year


data['vehicle_year'] = data['vehicle_year'].map(_vehicle_age)
data.drop(columns=['vehicle_make'], inplace=True)

# Median-fill whatever is still missing, then split into features and target.
data_filled = Myfillna_median(data)
feature_columns = [
    'age_oldest_tr', 'bankruptcy_ind', 'down_pyt', 'fico_score',
    'loan_amt', 'loan_term', 'ltv', 'msrp', 'purch_price', 'rev_util',
    'tot_derog', 'tot_income', 'tot_open_tr', 'tot_rev_debt',
    'tot_rev_line', 'tot_rev_tr', 'tot_tr', 'used_ind', 'veh_mileage',
    'vehicle_year',
]
X = data_filled[feature_columns]
y = data_filled['bad_ind']
print(data_filled)
      age_oldest_tr  bad_ind  bankruptcy_ind  down_pyt  fico_score  loan_amt  \
0              64.0        1             0.0      0.00       650.0  17200.00
1             240.0        0             0.0    683.54       649.0  19588.54
2              60.0        1             0.0      0.00       613.0  10500.00
3              35.0        1             0.0   3099.00       603.0  10800.00
4             104.0        0             0.0      0.00       764.0  26328.04
...             ...      ...             ...       ...         ...       ...
4171          103.0        1             0.0   2000.00       552.0  19000.00
4183          142.0        1             0.0      0.00       718.0  15121.58
4186           32.0        1             0.0   2136.00       624.0  15000.00
4194          101.0        1             0.0   2429.67       662.0  18700.00
4208           52.0        1             0.0   2503.00       711.0  10000.00

      loan_term    ltv     msrp  purch_price  ...  tot_derog  tot_income  \
0            36   99.0  17350.0     17200.00  ...        7.0     6550.00
1            60   99.0  19788.0     19588.54  ...        0.0     4666.67
2            60   92.0  11450.0     13595.00  ...        7.0     2000.00
3            60  118.0  12100.0     12999.00  ...        3.0     1500.00
4            60  122.0  22024.0     26328.04  ...        0.0     4144.00
...         ...    ...      ...          ...  ...        ...         ...
4171         60   86.0  22000.0     21500.00  ...       10.0     1775.00
4183         60  123.0  12325.0     15528.15  ...        1.0     2916.67
4186         42   97.0  15525.0     17136.00  ...        2.0     2750.00
4194         48  100.0  18700.0     21129.67  ...       12.0     5000.00
4208         36   83.0  12000.0     14503.00  ...        0.0     1280.00

      tot_open_tr  tot_rev_debt  tot_rev_line  tot_rev_tr  tot_tr  used_ind  \
0             2.0         506.0         500.0         1.0     9.0         1
1            11.0       34605.0       57241.0         7.0    21.0         0
2             5.0        3076.5       10009.5         3.0    10.0         1
3             5.0        4019.0        5946.0         4.0    10.0         1
4             2.0           0.0        1800.0         0.0    10.0         0
...           ...           ...           ...         ...     ...       ...
4171          8.0        6085.0        4803.0         6.0    27.0         0
4183          7.0        8760.0       10685.0         5.0    13.0         1
4186          4.0         851.0         500.0         1.0     9.0         1
4194          6.0        6195.0        9951.0         3.0    31.0         0
4208          5.0           0.0        2000.0         0.0     2.0         1

      veh_mileage  vehicle_year
0         24000.0          20.0
1            22.0          18.0
2         19600.0          20.0
3         10000.0          21.0
4            14.0          18.0
...           ...           ...
4171          1.0          18.0
4183      27975.0          20.0
4186      75000.0          24.0
4194          0.0          18.0
4208     100141.0          24.0

[7533 rows x 21 columns]

利用随机森林筛选变量

from sklearn.ensemble import RandomForestClassifier

# Fit a shallow random forest only to obtain per-feature importances.
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X, y)

importances = list(clf.feature_importances_)
importances_order = sorted(importances, reverse=True)
cols = list(X.columns)

# Rank (importance, column) pairs directly.  Looking each sorted importance up
# again with importances.index(...) returns the first match, so features with
# equal importance would all resolve to the same column name.
col_top = sorted(
    zip(importances, cols), key=lambda pair: pair[0], reverse=True
)[:9]

# Names of the nine highest-ranked features, in descending importance.
col = [name for _, name in col_top]
print(col_top)

[(0.2703856544549641, 'fico_score'), (0.12407590399997459,
'age_oldest_tr'), (0.11587163522770565, 'rev_util'),
(0.09324645022037789, 'ltv'), (0.08054739301439076, 'tot_derog'),
(0.07645932670619422, 'tot_rev_line'), (0.06170165695082909,
'tot_tr'), (0.03363202702336891, 'tot_rev_debt'),
(0.026431913888006843, 'purch_price')]

变量细筛与数据清洗

from woe import WoE
import warnings
warnings.filterwarnings("ignore")
data_filled.head()

# Information value (the fitted object's `.iv` attribute) per selected column.
iv_c = {}
# Columns whose WoE fit failed, mapped to the exception that was raised.
iv_failed = {}
for i in col:
    try:
        iv_c[i] = WoE(v_type='c').fit(data_filled[i], data_filled['bad_ind']).optimize().iv
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate; keep the reason
        # instead of discarding it.
        iv_failed[i] = exc
        print(i)

pd.Series(iv_c).sort_values(ascending=False)

rev_util
tot_rev_line
tot_rev_debt

fico_score       0.464300
age_oldest_tr    0.200235
tot_derog        0.179381
ltv              0.145189
tot_tr           0.107821
purch_price      0.019769
dtype: float64

WOE转换

def _woe_encode(feature):
    """Fit a 5-quantile WoE object on one column and return its transform."""
    target = data_filled['bad_ind']
    encoder = WoE(v_type='c', qnt_num=5).fit(feature, target).optimize()
    return encoder.fit_transform(feature, target)


# Apply the encoder column by column to the selected features.
WOE_c = data_filled[col].apply(_woe_encode)
WOE_c.head()

在这里插入图片描述

划分数据集

from sklearn.model_selection import train_test_split

# Features are the WoE-encoded columns; hold out 30% of the rows for testing.
X, y = WOE_c, data_filled['bad_ind']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw confusion matrix *cm* as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts, indexed as cm[row, column].
        classes: Tick labels used for both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.

    The function draws with pyplot and returns nothing; the caller is
    responsible for creating and showing the figure.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, column) pair in row-major order; using it
    # avoids depending on itertools, which this file only imports after this
    # definition.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

构建逻辑回归模型,进行违约概率预测

import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,recall_score,classification_report 

# An L1 penalty needs a solver that supports it; scikit-learn's default
# (lbfgs) does not and raises ValueError, so select liblinear explicitly.
lr = LogisticRegression(C=1, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train.values.ravel())
# Predict on the DataFrame itself so the feature names seen at fit time match.
y_pred = lr.predict(X_test)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall of class 1: TP / (TP + FN).
print("Recall metric in the testing dataset: ", recall_score(y_test, y_pred))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()

在这里插入图片描述

加入代价敏感参数,重新计算

import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,recall_score,classification_report 

# Same model as before, now with class_weight='balanced' so that errors on the
# rarer class are weighted more heavily.  An L1 penalty needs a solver that
# supports it; scikit-learn's default (lbfgs) does not, so select liblinear.
lr = LogisticRegression(C=1, penalty='l1', class_weight='balanced',
                        solver='liblinear')
lr.fit(X_train, y_train.values.ravel())
# Predict on the DataFrame itself so the feature names seen at fit time match.
y_pred = lr.predict(X_test)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall of class 1: TP / (TP + FN).
print("Recall metric in the testing dataset: ", recall_score(y_test, y_pred))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()

在这里插入图片描述

检验模型

from sklearn.metrics import roc_curve, auc

# A ROC curve needs a continuous score per sample.  Feeding the hard 0/1
# predictions collapses it to a single operating point, so use the predicted
# probability of the second class instead (assumes labels are 0/1, so column 1
# is the positive class).
y_score = lr.predict_proba(X_test)[:, 1]

# False-positive rate, true-positive rate and the thresholds that produce them.
fpr, tpr, threshold = roc_curve(y_test, y_score, drop_intermediate=False)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

lw = 2
# Create the figure once; a preceding bare plt.figure() only left an empty one.
plt.figure(figsize=(10, 10))
# FPR on the x axis, TPR on the y axis.
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

在这里插入图片描述

利用sklearn.metrics中的roc_curve算出的tpr、fpr绘制KS曲线

# KS curve: TPR, FPR and their gap plotted against the score threshold.
# To change the size, pass figsize= to plt.subplots; calling plt.figure(...)
# after plotting only opens a second, empty figure.
fig, ax = plt.subplots()

# roc_curve prepends a sentinel threshold (above every score, where nothing is
# predicted positive); drop it so the x axis only contains real thresholds.
# The KS curve is read in descending order of predicted probability, hence the
# 1 - threshold mirroring.
score_axis = 1 - threshold[1:]
ax.plot(score_axis, tpr[1:], label='tpr')
ax.plot(score_axis, fpr[1:], label='fpr')
ax.plot(score_axis, tpr[1:] - fpr[1:], label='KS')
ax.set_xlabel('score')
ax.set_title('KS Curve')
legend = ax.legend(loc='upper left', shadow=True, fontsize='x-large')

plt.show()

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值