- 数据来源
https://pan.baidu.com/s/1wO9qJRjnrm8uhaSP67K0lw
- 任务
数据集是金融数据(非原始数据,已经处理过了),我们要做的是预测贷款用户是否会逾期。表格中 "status" 是结果标签:0表示未逾期,1表示逾期。
特征选择:分别用IV值和随机森林进行特征选择。再用7个模型(逻辑回归、SVM、决策树、随机森林、GBDT、XGBoost和LightGBM),进行模型评估。
1.IV值和WOE
- IV值
IV的全称是Information Value,中文意思是信息价值,或者信息量。IV可以用来衡量自变量的预测能力。
- WOE值
WOE的全称是“Weight of Evidence”,即证据权重。WOE是对原始自变量的一种编码形式。
要对一个变量进行WOE编码,需要首先把这个变量进行分组处理(也叫离散化、分箱等等,说的都是一个意思)。分组后,对于第i组,WOE的计算公式如下:
$$WOE_i = \ln\left(\frac{p_{y_i}}{p_{n_i}}\right) = \ln\left(\frac{\#y_i/\#y_T}{\#n_i/\#n_T}\right)$$
其中,pyi是这个组中响应客户(风险模型中,对应的是违约客户,总之,指的是模型中预测变量取值为“是”或者说1的个体)占所有样本中所有响应客户的比例,pni是这个组中未响应客户占样本中所有未响应客户的比例,#yi是这个组中响应客户的数量,#ni是这个组中未响应客户的数量,#yT是样本中所有响应客户的数量,#nT是样本中所有未响应客户的数量。
WOE表示的实际上是“当前分组中响应客户占所有响应客户的比例”和“当前分组中没有响应的客户占所有没有响应的客户的比例”的差异。
对上述公式进行变换后如下:
$$WOE_i = \ln\left(\frac{\#y_i/\#n_i}{\#y_T/\#n_T}\right)$$
WOE也可以这么理解,它表示的是当前这个组中响应的客户和未响应客户的比值,和所有样本中这个比值的差异。这个差异是用这两个比值的比值,再取对数来表示的。WOE越大,这种差异越大,这个分组里的样本响应的可能性就越大;WOE越小,差异越小,这个分组里的样本响应的可能性就越小。
- IV值计算
对于一个分组后的变量,对于分组i,会有一个对应的IV值,即:
$$IV_i = (p_{y_i} - p_{n_i}) \times WOE_i = (p_{y_i} - p_{n_i}) \times \ln\frac{p_{y_i}}{p_{n_i}}$$
有了一个变量各分组的IV值,我们就可以计算整个变量的IV值,即:
$$IV = \sum_{i=1}^{n} IV_i$$
(n为变量分组个数)
2.数据读入以及包导入
from pandas import Series,DataFrame
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Load the dataset; the CSV is GBK-encoded (Chinese-language source data)
data = pd.read_csv(r'data.csv',encoding='gbk')
缺失值与最大类别占比统计
# Missing-value and dominant-category statistics for train and test sets.
# The original repeated the same loop verbatim for X_train and X_test;
# extracted into one helper.
def _column_stats(frame):
    """Per-column summary of *frame*: unique count, % missing,
    % taken by the most frequent value (NaN counted as a category,
    via dropna=False), and dtype."""
    rows = [
        (col,
         frame[col].nunique(),
         frame[col].isnull().sum() * 100 / frame.shape[0],
         frame[col].value_counts(normalize=True, dropna=False).values[0] * 100,
         frame[col].dtype)
        for col in frame.columns
    ]
    return pd.DataFrame(rows, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])

# Top 10 columns by missing percentage, training set then test set
stats_df = _column_stats(X_train)
stats_df.sort_values('Percentage of missing values', ascending=False)[:10]
stats_df = _column_stats(X_test)
stats_df.sort_values('Percentage of missing values', ascending=False)[:10]
3.特征选择
- IV值特征选择:
# Totals over the training labels, used as WOE denominators:
# yT = responders (status == 1), nT = non-responders.
total = y_train.count()
yT = y_train.sum()
nT = total - yT
def calc_WOE(data, col, target, total_y=None, total_n=None):
    """Build the per-group WOE table for one feature.

    Parameters
    ----------
    data : DataFrame holding the (possibly binned) feature column `col`.
    col : name of the column in `data` to group by.
    target : 0/1 label Series aligned with `data` (1 = response/default).
    total_y, total_n : overall response / non-response counts; when None
        they fall back to the module-level globals yT / nT computed from
        y_train (preserves the original call signature).

    Returns
    -------
    DataFrame indexed by group value with the merged response /
    non-response counts plus 'response', 'non_response' and 'WOE' columns.
    WOE_i = ln(p_yi / p_ni); groups with zero non-responders yield inf,
    as in the original.
    """
    if total_y is None:
        total_y = yT
    if total_n is None:
        total_n = nT
    # Per-group response counts (#yi) and non-response counts (#ni)
    gp = target.groupby(data[col])
    yi = DataFrame(gp.sum())
    ni = DataFrame(gp.count() - gp.sum())
    merged = pd.merge(yi, ni, how='left', left_index=True, right_index=True)
    # BUG FIX: the original addressed the merged columns as .status_x /
    # .status_y, which only works when the target Series is named 'status'.
    # Positional access works for any target name; the merge still emits the
    # same *_x / *_y column names as before when the target is 'status'.
    resp_counts = merged.iloc[:, 0]
    non_counts = merged.iloc[:, 1]
    # Vectorized; the original's round(x, 100) was a no-op (floats carry
    # fewer than 17 significant digits), so results are unchanged.
    merged['response'] = resp_counts * 1.0 / total_y
    merged['non_response'] = non_counts * 1.0 / total_n
    merged['WOE'] = np.log(merged['response'] / merged['non_response'])
    return merged
def calc_IV(data):
    """Total IV for a WOE table: sum over groups of (p_yi - p_ni) * WOE_i.

    Adds an 'IV' column to *data* in place (per-group contributions) and
    returns their sum.
    """
    data['IV'] = (data['response'] - data['non_response']) * data['WOE']
    return data['IV'].sum()
# Select features whose IV exceeds 0.1
data_IV = DataFrame()
fea_iv = []
X_num = X_train.select_dtypes(include=['number']).copy()
num_cols = list(X_num.columns)
for col in good_cols:
    if col in num_cols:
        # Numeric features: quantile-bin into (up to) 20 groups first;
        # duplicates='drop' merges bins with identical edges
        cats = DataFrame(pd.qcut(X_train[col], 20, labels=False, duplicates='drop'))
        col_WOE = calc_WOE(cats, col, y_train)
    else:
        # Non-numeric features: group directly on the raw values
        col_WOE = calc_WOE(X_train, col, y_train)
    IV = calc_IV(col_WOE)
    if IV > 0.1:
        data_IV[col] = [IV]
        fea_iv.append(col)
# BUG FIX: was the Python-2 statement `print fea_iv` — a SyntaxError under
# Python 3 and inconsistent with the print() calls used elsewhere in the file
print(fea_iv)
- 随机森林特征选择:
# Random-forest feature selection: keep the 20 most important features
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)
clf_rf_impc = pd.Series(clf_rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
# BUG FIX: was `rfc_impc[:20]` — an undefined name (NameError); this is the
# error reported in section 7. The Series is named clf_rf_impc above.
fea_gini = clf_rf_impc[:20].index.tolist()
print(fea_gini)
4.特征合成
# Final feature set: union of the RF-selected and IV-selected names
features = list(set(fea_iv).union(fea_gini))
X_new = X[features]
print(X_new.shape)
5.模型训练
# The original repeated the same fit/predict/evaluate sequence verbatim for
# all seven models (~13 lines each); extracted into one helper. All original
# module-level variable names are preserved.
# NOTE(review): model_evaluation is defined later in this file — fine when the
# cells run in notebook order, but would NameError if run top-to-bottom as a
# plain script; confirm intended execution order.
def _fit_and_evaluate(clf, use_decision_function=False):
    """Fit *clf* on the training set, print metrics for train and test sets,
    and return (train_pred, train_score, test_pred, test_score)."""
    clf.fit(X_train, y_train)
    if use_decision_function:
        # SVC without probability=True exposes only decision_function
        score = clf.decision_function
    else:
        score = lambda X: clf.predict_proba(X)[:, 1]
    train_pred, train_pro = clf.predict(X_train), score(X_train)
    test_pred, test_pro = clf.predict(X_test), score(X_test)
    model_evaluation(y_train, train_pred, train_pro)  # training-set scores
    model_evaluation(y_test, test_pred, test_pro)     # test-set scores
    return train_pred, train_pro, test_pred, test_pro

# SVM
clf_svm = svm.SVC(random_state=2018)
train_svm_predict, train_svm_predict_pro, test_svm_predict, test_svm_predict_pro = \
    _fit_and_evaluate(clf_svm, use_decision_function=True)

# Decision tree
clf_tree = tree.DecisionTreeClassifier(random_state=2018)
train_tree_predict, train_tree_predict_pro, test_tree_predict, test_tree_predict_pro = \
    _fit_and_evaluate(clf_tree)

# Logistic regression
clf_lr = LogisticRegression(random_state=2018)
train_lr_predict, train_lr_predict_pro, test_lr_predict, test_lr_predict_pro = \
    _fit_and_evaluate(clf_lr)

# Random forest (NOTE: seed 2011 kept from the original — differs from the
# 2018 used elsewhere; confirm whether that was intentional)
clf_rf = RandomForestClassifier(random_state=2011)
train_rf_predict, train_rf_predict_pro, test_rf_predict, test_rf_predict_pro = \
    _fit_and_evaluate(clf_rf)

# GBDT
clf_gbdt = GradientBoostingClassifier()
train_gbdt_predict, train_gbdt_predict_pro, test_gbdt_predict, test_gbdt_predict_pro = \
    _fit_and_evaluate(clf_gbdt)

# XGBoost
clf_xgb = xgb.XGBClassifier()
train_xgb_predict, train_xgb_predict_pro, test_xgb_predict, test_xgb_predict_pro = \
    _fit_and_evaluate(clf_xgb)

# LightGBM
clf_lgb = lgb.LGBMClassifier()
train_lgb_predict, train_lgb_predict_pro, test_lgb_predict, test_lgb_predict_pro = \
    _fit_and_evaluate(clf_lgb)
6.模型评估
# Score a model and draw its ROC curve
def model_evaluation(y_label, y_predict, y_predict_pro):
    """Print accuracy/precision/recall/F1/AUC and plot the ROC curve.

    Parameters
    ----------
    y_label : true 0/1 labels.
    y_predict : predicted class labels (used for the threshold metrics).
    y_predict_pro : ranking scores — predicted probabilities or decision
        values — used for the AUC and the ROC curve.
    """
    accuracy = accuracy_score(y_label, y_predict)
    precision = precision_score(y_label, y_predict)
    recall = recall_score(y_label, y_predict)
    f1 = f1_score(y_label, y_predict)
    # renamed from `auc`: that local shadowed sklearn.metrics.auc imported
    # at the top of the file
    auc_score = roc_auc_score(y_label, y_predict_pro)
    fpr, tpr, thresholds = roc_curve(y_label, y_predict_pro)
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % auc_score)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
    # BUG FIX: the originals were Python-2 print statements (SyntaxError
    # under Python 3); output text is unchanged
    print('accuracy:', accuracy)
    print('precision:', precision)
    print('recall:', recall)
    print('f1_score:', f1)
    print('roc_auc_score:', auc_score)
7.问题
目前在随机森林特征选择处报错(变量名拼写不一致:定义的是 clf_rf_impc,引用时写成了 rfc_impc,导致 NameError),仍在调试。模型训练和评估是算法实践的结果。