阿里AI天池大赛-贷款违约预测-基于CatBoost十折交叉验证

1:报名地址

https://tianchi.aliyun.com/competition/entrance/531830/introduction

2:排名分数 

3:模型源码

        废话不多说,直接上源码 

import pandas as pd
import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score

## Dataset splitting (hold-out split) and the CatBoost classifier
from sklearn.model_selection import train_test_split  
from catboost import CatBoostClassifier

# Load the training set and the test set from the working directory.
train = pd.read_csv("./train.csv")
testA = pd.read_csv("./testA.csv")

# Every non-object column of the training set except the target
# 'isDefault' is treated as a numerical feature.
numerical_fea = train.select_dtypes(exclude=['object']).columns.tolist()
numerical_fea.remove('isDefault')
# Fill missing numerical values with the per-column median of the same frame
# (train uses train medians, testA uses testA medians).
for frame in (train, testA):
    frame[numerical_fea] = frame[numerical_fea].fillna(frame[numerical_fea].median())


# Data cleaning and feature engineering.
# The same transformation is applied to both the training and the test frame
# through a single helper, so the two can never drift apart.

# Reference date against which loan issue dates are measured.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')

# Loan grade letter -> ordinal code.
_GRADE_MAP = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7}

# Employment length label -> number of years ('< 1 year' becomes 0).
_EMPLOYMENT_LENGTH_MAP = {'1 year':1,'2 years':2,'3 years':3,'4 years':4,'5 years':5,'6 years':6,'7 years':7,'8 years':8,'9 years':9,'10+ years':10,'< 1 year':0}

# Loan sub-grade label -> integer code. NOTE: the codes do not follow the
# alphabetical grade order; they are only used as identifiers.
_SUBGRADE_MAP = {'E2':1,'D2':2,'D3':3,'A4':4,'C2':5,'A5':6,'C3':7,'B4':8,'B5':9,'E5':10,
    'D4':11,'B3':12,'B2':13,'D1':14,'E1':15,'C5':16,'C1':17,'A2':18,'A3':19,'B1':20,
    'E3':21,'F1':22,'C4':23,'A1':24,'D5':25,'F2':26,'E4':27,'F3':28,'G2':29,'F5':30,
    'G3':31,'G1':32,'F4':33,'G4':34,'G5':35}

# Year from which the age of the earliest credit line is counted.
_REFERENCE_YEAR = 2021


def _add_features(data):
    """Clean the raw loan columns of ``data`` and add derived features in place.

    Columns rewritten: 'issueDate' (parsed to datetime), 'grade',
    'employmentLength', 'subGrade' (mapped to integer codes; labels missing
    from the mapping become NaN) and 'earliesCreditLine' (reduced to its
    four-digit year).
    Columns added: 'issueDateDT', 'earliesCreditLine_Year' and 'rato'.

    Args:
        data: DataFrame holding the raw columns listed above plus 'totalAcc'
            and 'loanAmnt'. It is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    # Parse the loan issue date.
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    # Number of days between the reference date and the issue date.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
    # Loan grade.
    data['grade'] = data['grade'].map(_GRADE_MAP)
    # Employment length in years.
    data['employmentLength'] = data['employmentLength'].map(_EMPLOYMENT_LENGTH_MAP)
    # Loan sub-grade.
    data['subGrade'] = data['subGrade'].map(_SUBGRADE_MAP)
    # The last four characters of 'earliesCreditLine' are taken as the year
    # in which the borrower's earliest reported credit line was opened.
    credit_year = data['earliesCreditLine'].str[-4:].astype('int64')
    # Age of the earliest credit line relative to the reference year.
    data['earliesCreditLine_Year'] = _REFERENCE_YEAR - credit_year
    data['earliesCreditLine'] = credit_year
    # Total number of credit lines in the borrower's file divided by the
    # loan amount.
    data['rato']=data['totalAcc']/data['loanAmnt']
    return data


for data in [train, testA]:
    _add_features(data)


# Submission skeleton: the test ids plus a placeholder prediction column.
sub = testA[['id']].assign(isDefault=0)

# Columns that are not fed to the model.
testA = testA.drop(columns=['id', 'issueDate'])
data_x = train.drop(columns=['isDefault', 'id', 'issueDate'])
# Target kept as a one-column DataFrame.
data_y = train.loc[:, ['isDefault']].copy()

# Stratified 75/25 hold-out split.
# NOTE(review): x, val_x, y and val_y are not referenced again in the
# visible part of this script.
x, val_x, y, val_y = train_test_split(
    data_x, data_y, test_size=0.25, random_state=1, stratify=data_y
)

# Names of the columns handed to CatBoost as categorical features.
col = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'initialListStatus', 'applicationType', 'policyCode',
]

# Cast each categorical column that is present to string, in both the
# training features and the test features.
for frame in (data_x, testA):
    for name in [c for c in frame.columns if c in col]:
        frame[name] = frame[name].astype('str')

# CatBoost model configuration, collected in one mapping and unpacked into
# the constructor.
cat_params = {
    "loss_function": "Logloss",   # objective that is optimised
    "eval_metric": "AUC",         # metric reported on the evaluation set
    "task_type": "CPU",
    "learning_rate": 0.1,
    "iterations": 1000,           # upper bound on the number of trees
    "random_seed": 2021,
    "od_type": "Iter",            # overfitting detector type
    "depth": 7,
}
model = CatBoostClassifier(**cat_params)

# Per-fold test-set probability vectors and the running mean validation AUC.
answers = []
mean_score = 0

# 10-fold stratified cross-validation.
n_folds = 10
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2021)
# The index arrays get their own names so that the `train` DataFrame is not
# overwritten by the loop variables.
for train_idx, valid_idx in sk.split(data_x, data_y):
    x_train, y_train = data_x.iloc[train_idx], data_y.iloc[train_idx]
    x_valid, y_valid = data_x.iloc[valid_idx], data_y.iloc[valid_idx]

    # Fit on the training folds, using the held-out fold as evaluation set.
    clf = model.fit(x_train, y_train, eval_set=(x_valid, y_valid), verbose=500, cat_features=col)

    # Last probability column = probability of the positive class.
    valid_pred = clf.predict(x_valid, prediction_type='Probability')[:, -1]
    # Compute the fold AUC once and reuse it for logging and averaging.
    fold_auc = roc_auc_score(y_valid, valid_pred)
    print('cat验证的auc:{}'.format(fold_auc))
    mean_score += fold_auc / n_folds

    # Test-set probabilities predicted by this fold's model.
    answers.append(clf.predict(testA, prediction_type='Probability')[:, -1])

print('mean valAuc:{}'.format(mean_score))
# Average the per-fold test predictions and write the submission file.
cat_pre = sum(answers) / n_folds
sub['isDefault'] = cat_pre
sub.to_csv('./baseline-0.7420.csv', index=False)

 4:提分要领 

        1:特征的提取

                特征的提取会对分数的提高提供很大的帮助,例如本例中的

                贷款发放的月份时间格式处理

                贷款发放的月份与设置的时间差

                贷款等级

                就业年限(年)

                借款人最早报告的信用额度开立的月份与当前时间差

        2:CatBoost模型祖传参数的微调

        3:十折交叉验证

                经过测试,本实例中十折的分数略高于五折。因此为了提分,应首先做好特征处理,再进行参数微调,待模型调整趋于稳定之后,再进行 K 折交叉验证。

5:相关知识补充

        1:CatBoost模型相关知识可以参考另外一位大神的文章

        https://blog.csdn.net/abcdefg90876/article/details/103220983
        2:CatBoostClassifier/CatBoostRegressor 通用参数 

        loss_function:损失函数,字符串 (分类任务,default= Logloss ,回归任务,default= RMSE )
        eval_metric:过拟合检测或者最优模型选择的评估指标
        task_type : 任务类型,CPU或者GPU,default=CPU
        learning_rate:学习率,default=0.03
        iterations:迭代次数, 解决机器学习问题能够构建的最大树的数目,default=1000
        random_seed : 随机数种子,default=0
        od_type : 过拟合检测类型,default=IncToDec
        depth : 树的深度,default=6       

  • 7
    点赞
  • 51
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

不要迷恋发哥

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值