数据挖掘:贷款违约预测

数据

数据来源于阿里天池学习赛“零基础入门金融风控-贷款违约预测”。

代码

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,accuracy_score
import joblib
from catboost import CatBoostClassifier
%matplotlib inline

# Hard-coded local paths to the training file (train.csv) and the test file
# (testA.csv); adjust them before running on another machine.
filename = r'C:\Users\liuhao\Desktop\新建文件夹\贷款违约预测\train.csv'
filename1 = r'C:\Users\liuhao\Desktop\新建文件夹\贷款违约预测\testA.csv'
data = pd.read_csv(filename)
data1 = pd.read_csv(filename1)
# Stack the training rows (label column 'isDefault' removed) on top of the test
# rows so the feature engineering below runs once over both. ignore_index is
# not set, so each frame keeps its own row labels; sort=True lets pandas sort
# the column axis when the two frames' columns are not already aligned.
# NOTE(review): because train and test are binned together later, test-set
# information influences the training features.
combined_data = pd.concat([data.drop(labels='isDefault',axis=1),data1],axis=0,sort=True)

def feature_cut(data,fea_list):
    """Add an integer quantile-bin column for each listed feature, in place.

    For every name in ``fea_list`` the column ``data[name]`` is split into up
    to 200 quantile bins with ``pd.qcut`` (bins whose edges coincide are
    merged) and the position of each row's bin is written to a new column
    ``name + '_cut'``.

    Codes are dense and follow the bin order: the lowest observed bin is 0,
    the next observed bin is 1, and so on. Missing values get a code of their
    own, one past the highest observed bin.

    Args:
        data: DataFrame containing the numeric columns named in ``fea_list``.
            It is modified in place.
        fea_list: Iterable of column names to discretise.

    Returns:
        None. The result is the new ``*_cut`` columns on ``data``.
    """
    for feature in fea_list:
        binned = pd.qcut(data[feature], q=200, duplicates='drop')
        # Categorical codes already follow the interval order, so there is no
        # need to materialise every Interval as a Python object and re-encode
        # it. Dropping unused categories keeps the codes dense.
        observed = binned.cat.remove_unused_categories()
        codes = observed.cat.codes.to_numpy().astype('int64')
        # pandas marks missing values with -1; move them to a trailing code so
        # they form one ordinary category after the observed bins.
        codes[codes < 0] = len(observed.cat.categories)
        # Assign a plain array so no index alignment is attempted (the frame
        # may carry repeated row labels).
        data[feature + '_cut'] = codes

# Continuous columns that get a quantile-binned "<name>_cut" companion column.
fea_list = [
    'loanAmnt',
    'installment',
    'annualIncome',
    'dti',
    'revolBal',
    'revolUtil',
]

# Columns handed to the model as categorical features: the raw/derived
# categorical columns followed by the binned version of every fea_list entry.
cat_features = [
    'term',
    'verificationStatus',
    'employmentLength',
    'initialListStatus',
    'grade',
    'subGrade',
    'issueDate_year',
    'regionCode',
    'issueDate_month',
    'earliesCreditLine_year',
    'postCode',
    'earliesCreditLine_month',
    'employmentTitle',
    'purpose',
    'title',
] + [name + '_cut' for name in fea_list]

# Textual employment length -> number of years ('< 1 year' counts as 0,
# '10+ years' as 10).
employmentLength_dict = {
    '1 year': 1,
    '10+ years': 10,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '< 1 year': 0,
}

# Three-letter English month abbreviation -> month number.
month_dict = {
    'Aug': 8,
    'May': 5,
    'Jul': 7,
    'Oct': 10,
    'Dec': 12,
    'Apr': 4,
    'Jan': 1,
    'Nov': 11,
    'Feb': 2,
    'Mar': 3,
    'Jun': 6,
    'Sep': 9,
}

# Feature engineering on the stacked train+test frame. Statement order matters:
# later lines read columns created by earlier ones.

# Adds a '<name>_cut' quantile-bin column for every entry of fea_list (in place).
feature_cut(combined_data,fea_list)
# Replace the textual employment length with its numeric value; strings that
# are not keys of employmentLength_dict (and missing values) become NaN.
combined_data['employmentLength'] = combined_data['employmentLength'].map(employmentLength_dict)
# issueDate is split on '-': first piece -> year, second piece -> month.
# Presumably the column looks like 'YYYY-MM-DD' -- TODO confirm against the data.
combined_data['issueDate_year'] = combined_data['issueDate'].apply(lambda x:int(x.split('-')[0]))
combined_data['issueDate_month'] = combined_data['issueDate'].apply(lambda x:int(x.split('-')[1]))
# earliesCreditLine is split on '-': the second piece is read as the year and
# the first piece is looked up in month_dict (presumably 'Mon-YYYY' -- TODO confirm).
combined_data['earliesCreditLine_year'] = combined_data['earliesCreditLine'].apply(lambda x: int(x.split('-')[1]))
combined_data['earliesCreditLine_month'] = combined_data['earliesCreditLine'].apply(lambda x: x.split('-')[0]).map(month_dict)
# Years between the earliest credit line and the loan issue date.
combined_data['year_gap'] = combined_data['issueDate_year'] - combined_data['earliesCreditLine_year']
# Ratio features. The +1 keeps the denominator non-zero for non-negative inputs.
combined_data['loan_annualIncome'] = combined_data['loanAmnt']/(combined_data['annualIncome']+1)
combined_data['pubRec_ratio'] = combined_data['pubRec']/(combined_data['pubRecBankruptcies']+1)
# NOTE(review): no +1 here, so rows with totalAcc == 0 produce inf or NaN.
combined_data['openAcc_totalAcc'] = combined_data['openAcc']/combined_data['totalAcc']
combined_data['monthlyincome'] = combined_data['annualIncome']/12
combined_data['installment_monthlyincome'] = combined_data['installment']/(combined_data['monthlyincome']+1)

# Columns removed before modelling: the raw continuous columns that were
# replaced by their '_cut' versions, identifiers/raw date strings, and a set of
# other columns the author chose to leave out. Defined once so the training
# and submission feature matrices always drop exactly the same columns.
drop_cols = [
    'loanAmnt', 'installment', 'annualIncome', 'dti', 'revolBal', 'revolUtil',
    'id', 'issueDate', 'earliesCreditLine', 'pubRecBankruptcies',
    'openAcc', 'policyCode', 'totalAcc', 'applicationType',
    'ficoRangeHigh', 'n3', 'n10', 'n12', 'n0', 'n7',
]

# Undo the train/test stacking: the first len(data) rows are the training rows.
n_train = data.shape[0]
train = combined_data[:n_train]
test = combined_data[n_train:]

X = train.drop(columns=drop_cols)
y = data['isDefault']

# Categorical features are passed to the model as strings.
X[cat_features] = X[cat_features].astype(str)

# Hold-out split (default test_size) with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=15, shuffle=True)

model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    max_depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
)
# NOTE(review): the hold-out split doubles as the eval_set used by
# use_best_model=True, so the hold-out scores printed below are optimistic.
model.fit(
    x_train,
    y_train,
    cat_features=cat_features,
    eval_set=(x_test, y_test),
    verbose=False,
    use_best_model=True,
)

# Report: score on the train split, score on the hold-out split, hold-out AUC,
# then the score on the full training data.
predprob = model.predict_proba(x_test)
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))
print(roc_auc_score(y_test, predprob[:, 1]))
print(model.score(X, y))

# Feature importances, highest first.
importance = list(zip(model.feature_names_, model.feature_importances_))
print(sorted(importance, key=lambda x: x[1], reverse=True))

# Build the submission features exactly like the training features.
xtest = test.drop(columns=drop_cols)
xtest[cat_features] = xtest[cat_features].astype(str)
predprob1 = model.predict_proba(xtest)

# Submission file: one row per test id with the predicted probability of the
# positive class, written without the index column.
result = pd.DataFrame()
result['id'] = test['id']
result['isDefault'] = predprob1[:, 1]
result.to_csv('catboost.csv', index=False)

说明

  • 最终提交结果线上AUC得分 0.7358,排名 269/6573。
  • 没有做数据清洗和异常值处理,只做了离散化处理;并且离散化(分箱)时把训练集和测试集合并在一起处理,造成了数据泄露,线上得分因此有所提升。
  • 没有做调参,模型参数是随便设置的,迭代次数也比较低。
  • 2
    点赞
  • 16
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值