- 数据来源
https://pan.baidu.com/s/1wO9qJRjnrm8uhaSP67K0lw
- 任务
数据集是金融数据(非原始数据,已经处理过了),我们要做的是预测贷款用户是否会逾期。表格中 "status" 是结果标签:0表示未逾期,1表示逾期。
数据类型转换和缺失值处理(尝试不同的填充方式并比较效果),以及其他你能借鉴的数据探索。
- 实现过程
- 数据读入
from pandas import Series,DataFrame
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# Load the dataset; the file is decoded as GBK.
data = pd.read_csv('data.csv', encoding='gbk')
- 数据清洗
# Drop irrelevant columns: person/account identifiers first, then (below)
# every feature column that holds one and the same value in all rows.
data.drop(['custid', 'trade_no', 'bank_card_no', 'id_name'], axis=1, inplace=True)

# Separate the target label from the feature matrix.
y = data['status']
X = data.drop(['status'], axis=1)

# Collect the single-valued feature columns and remove them in one call.
one_value_col = [col for col in X if len(X[col].unique()) == 1]
X.drop(one_value_col, axis=1, inplace=True)
- 缺失值填充
# Split the features by dtype so numeric and non-numeric columns can be
# processed separately; copies keep the in-place fills below off of X.
X_num = X.select_dtypes(include=['number']).copy()
X_oth = X.select_dtypes(exclude=['number']).copy()

# Missing-value statistics: fraction of missing entries per column,
# sorted from the most to the least incomplete column.
X_num_null = X_num.isnull().mean().sort_values(ascending=False)
X_oth_null = X_oth.isnull().mean().sort_values(ascending=False)

# Missing-value filling: 'student_feature' gets 0 before the generic fill.
X_num.fillna({'student_feature': 0}, inplace=True)

# Fill the remaining gaps with each column's mode (avoids the influence of
# extremely large or small outliers; especially suited to non-numeric data).
num_modes = X_num.mode().iloc[0]
oth_modes = X_oth.mode().iloc[0]
X_num.fillna(num_modes, inplace=True)
X_oth.fillna(oth_modes, inplace=True)
- 类别特征采用one-hot编码,日期特征进行相应解析
# One-hot encode reg_preference_for_trad; the result seeds the frame that
# collects all features derived from the non-numeric columns.
X_oth_new = pd.get_dummies(X_oth['reg_preference_for_trad'])

# Date parsing: expand each date column into year / month / day features.
loans_latest_tim = pd.DatetimeIndex(X_oth['loans_latest_time'])
latest_query_time = pd.DatetimeIndex(X_oth['latest_query_time'])
date_part_columns = [
    ('year_loans_latest', loans_latest_tim.year),
    ('month_loans_latest', loans_latest_tim.month),
    ('day_loans_latest', loans_latest_tim.day),
    ('year_latest_query', latest_query_time.year),
    ('month_latest_query', latest_query_time.month),
    ('day_latest_query', latest_query_time.day),
]
for column_name, column_values in date_part_columns:
    X_oth_new[column_name] = column_values
- 数据集划分
# Preprocessed data: numeric features side by side with the features
# derived from the non-numeric columns.
feature_frames = [X_num, X_oth_new]
X_new = pd.concat(feature_frames, axis=1)

# Train/test split: 30% of the rows are held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.3,
    random_state=2018,
)
- 评价函数
# Score the predictions and draw the ROC curve.
def model_evaluation(y_label, y_predict, y_predict_pro):
    """Print classification metrics and plot the ROC curve.

    Args:
        y_label: Ground-truth labels.
        y_predict: Predicted class labels; scored with accuracy, precision,
            recall and F1.
        y_predict_pro: Predicted scores for the positive class; used for the
            ROC AUC value and for the ROC curve itself.

    Returns:
        None. The ROC figure is shown first, then the metrics are printed.
    """
    accuracy = accuracy_score(y_label, y_predict)
    precision = precision_score(y_label, y_predict)
    recall = recall_score(y_label, y_predict)
    f1 = f1_score(y_label, y_predict)
    # Named roc_auc rather than auc so the `auc` function imported at the
    # top of the file is not shadowed inside this function.
    roc_auc = roc_auc_score(y_label, y_predict_pro)
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_label, y_predict_pro)

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # A single pre-formatted string inside print(...) produces the same
    # output on Python 2 and Python 3; the former `print a, b` statements
    # are a SyntaxError on Python 3.
    print('accuracy: %s' % accuracy)
    print('precision: %s' % precision)
    print('recall: %s' % recall)
    print('f1_score: %s' % f1)
    print('roc_auc_score: %s' % roc_auc)
- 模型训练和结果输出
# Fit a logistic-regression classifier on the training split; fit() returns
# the estimator itself, so construction and fitting can be chained.
clf_lr = LogisticRegression(random_state=2018).fit(X_train, y_train)

# Predicted labels and positive-class scores (column 1 of predict_proba)
# for the test split.
test_lr_predict = clf_lr.predict(X_test)
test_lr_proba = clf_lr.predict_proba(X_test)
test_lr_predict_pro = test_lr_proba[:, 1]

# Score the test-set predictions.
model_evaluation(
    y_test,
    test_lr_predict,
    test_lr_predict_pro,
)
- 最终结果的准确率不高:
accuracy: 0.748423265592
对特征的构建以及相关性分析不足,存在冗余和无效特征,干扰较明显。