一、项目介绍
阿里天池数据挖掘学习赛:贷款违约预测
学习赛主要为初学者准备的,论坛里有详细的项目建模过程。这里,仅记录一下自己在做这个项目时的一些思路。
二、数据分析
这里的数据分析过程主要参考有:
Datawhale零基础入门金融风控 Task2 数据分析
「机器学习」天池比赛:金融风控贷款违约预测
上面两个帖子讲述的已经非常详细,在这里就不再赘述了。
三、特征工程
1. 缺失值处理
先导入需要的库
import pandas as pd
import datetime
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split # 分训练集和测试集
from catboost import CatBoostClassifier
import time
from sklearn.tree import DecisionTreeClassifier
首先,通过data.info()可知数据的特征类型分为数值型特征和类别型特征,并且这两种都存在缺失值;对于数值型特征直接采用中位数填充缺失值;类别型特征只有employLength存在缺失值,这里采用决策树去预测缺失值的方法进行填充。
然后,将类别型的特征(贷款等级/子等级、就业时长)通过编码转换为数值型。
最后,对日期型数据进行处理(发行日期、信用额度开立的月份),issue_days主要记录距离最早发行日期的天数。
import time
from sklearn.tree import DecisionTreeClassifier
def gradeTrans(x):
    """Map a loan grade letter 'A'..'G' to an integer code 1..7.

    Raises KeyError for any other value.
    """
    # Renamed the lookup table so it no longer shadows the builtin `dict`.
    grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    return grade_map[x]
def subGradeTrans(x):
    """Encode a sub-grade string like 'C4' as an ordinal integer.

    The grade letter contributes letter_code * 5 and the digit is added,
    e.g. 'A1' -> 1*5 + 1 = 6, 'G5' -> 7*5 + 5 = 40.  (The original encoding
    starts at 6 rather than 1; preserved for compatibility.)
    """
    # Renamed the lookup table so it no longer shadows the builtin `dict`.
    letter_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    return letter_map[x[0]] * 5 + int(x[1])
def employmentLength_deal(x):
    """Convert an employment-length string to a numeric value in years.

    '< 1 year'  -> 0.5
    '10+ years' -> 12   (original choice: open-ended bucket mapped to 12)
    'N year(s)' -> N
    """
    if x == '< 1 year':
        return 0.5
    if x == '10+ years':
        return 12
    # Parse the full leading number instead of only its first character
    # (the original `x.split(' ')[0][0]` would silently truncate any
    # multi-digit value to its first digit).
    return int(x.split(' ')[0])
def transform_day(date1):
    """Return the number of days from 2007-06-01 to `date1` ('YYYY-MM-DD').

    2007-06-01 is the earliest loan issue date in the dataset, so this
    yields a non-negative "days since first issue" feature for real rows.
    """
    # Parse directly with datetime.strptime instead of the original
    # time.strptime -> datetime.datetime(tuple fields) round trip.
    base = datetime.datetime(2007, 6, 1)
    return (datetime.datetime.strptime(date1, "%Y-%m-%d") - base).days
def earliesCreditLine_month_deal(x):
    """Extract the month number (1-12) from a value like 'Aug-2001'."""
    # Renamed the lookup table so it no longer shadows the builtin `dict`.
    month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    return month_map[x.split('-')[0]]
# 预处理
def data_preprocess():
# 加载数据
train_label = pd.read_csv('train.csv')['isDefault']
train = pd.read_csv('train.csv')
test = pd.read_csv('testA.csv')
# 拼接数据
data = pd.concat([train, test], axis=0, ignore_index=True) # 将训练数据与测试数据进行拼接
print('初始拼接后:', data.shape)
# 处理缺失值
# 数值型
numerical_fea = list(data.select_dtypes(exclude=['object']).columns) # 获取数值型特征对应的列
numerical_fea.remove('isDefault') # 将标签列移除
print(numerical_fea)
data[numerical_fea] = data[numerical_fea].fillna(data[numerical_fea].median()) # 用中位数填充缺失值
# 类别型
# 用决策树填补就业年限
empLenNotNullInd = data.employmentLength.notnull() # 不是空的行,返回True
columns = ['postCode','regionCode','employmentTitle','annualIncome'] # 用四个特征来预测employmentLength
data_empLen_X = data.loc[empLenNotNullInd,columns] # 取不为空的行,以及上述四列
data_empLen_y = data.employmentLength[empLenNotNullInd] # 取行不为空的标签(雇佣年限)
DTC = DecisionTreeClassifier() # 实例化
DTC.fit(data_empLen_X ,data_empLen_y) # 训练
print('决策树训练分数:',DTC.score(data_empLen_X ,data_empLen_y))
empLenIsNullInd = data.employmentLength.isnull() # 取就业年限为空的行
test_empLen_X = data.loc[empLenIsNullInd,columns]
empLen_pred = DTC.predict(test_empLen_X)
data.employmentLength[empLenIsNullInd] = empLen_pred
print('缺失值情况:', data.isnull().sum())
data['grade'] = data['grade'].apply(lambda x: gradeTrans(x)) # 对贷款等级进行编码 A:1 ...
data['subGrade'] = data['subGrade'].apply(lambda x: subGradeTrans(x)) # 对贷款子等级进行编码
print('1data.shape', data.shape)
data['employmentLength'] = data['employmentLength'].apply(lambda x: employmentLength_deal(x)) # 将就业时长(类别特征)转换为数值特征
data['issueDate_year'] = data['issueDate'].apply(lambda x: int(x.split('-')[0])) # 将贷款发行的时间拆分为年月天数(距离最早发行日期的天数2007-6-1
data['issueDate_month'] = data['issueDate'].apply(lambda x: int(x.split('-')[1]))
data['issueDate_day'] = data['issueDate'].apply(lambda x: transform_day(x)) # 到最早发行日2007-6-1的天数
# data['issueDate_week'] = data['issueDate_day'].apply(lambda x: int(x % 7) + 1)
print('2_data.shape', data.shape)
data['earliesCreditLine_year'] = data['earliesCreditLine'].apply(lambda x: 2022 - (int(x.split('-')[-1]))) # 最早信用额度开户时间距离现在的年数
data['earliesCreditLine_month'] = data['earliesCreditLine'].apply(lambda x: earliesCreditLine_month_deal(x)) #
data['earliesCreditLine_Allmonth'] = data['earliesCreditLine_year'] * 12 - data['earliesCreditLine_month'] # 最早信用额度开户时间距离现在的月数
del data['issueDate'], data['earliesCreditLine']
print('预处理完毕', data.shape)
return data, train_label
# Run the full preprocessing pipeline once at import time.
data, train_label = data_preprocess()
2. 自定义特征
# 自定义可解释特征
def gen_basicFea(data):
    """Add hand-crafted, interpretable ratio/aggregate features.

    Mutates `data` in place (also drops ficoRangeHigh/ficoRangeLow after
    averaging them) and returns the same frame.
    """
    # Hoist the series reused by several features below.
    emp_len = data['employmentLength']
    income = data['annualIncome']
    term = data['term']
    data['avg_income'] = income / emp_len                    # income per year employed
    data['total_income'] = income * emp_len                  # total income since hire
    data['avg_loanAmnt'] = data['loanAmnt'] / term           # loan amount per term unit
    data['mean_interestRate'] = data['interestRate'] / term  # rate per term unit
    data['all_installment'] = data['installment'] * term     # total installment payments
    # +0.1 guards against zero-income rows (287 of them in the data).
    data['rest_money_rate'] = data['avg_loanAmnt'] / (income + 0.1)
    data['rest_money'] = income - data['avg_loanAmnt']       # savings left after yearly repayment
    data['closeAcc'] = data['totalAcc'] - data['openAcc']    # closed credit lines on file
    # Midpoint of the FICO range at issuance; the raw bounds are dropped.
    data['ficoRange_mean'] = (data['ficoRangeHigh'] + data['ficoRangeLow']) / 2
    del data['ficoRangeHigh'], data['ficoRangeLow']
    data['rest_pubRec'] = data['pubRec'] - data['pubRecBankruptcies']  # uncleared derogatory records
    data['rest_Revol'] = data['loanAmnt'] - data['revolBal']
    data['dis_time'] = data['issueDate_year'] - (2022 - data['earliesCreditLine_year'])
    data['rato'] = data['totalAcc'] / data['loanAmnt']       # (sic) accounts per loan amount
    return data
data = gen_basicFea(data)  # append the hand-crafted features (drops the raw FICO bounds)
这里主要参考了博客天池安泰杯金融科技挑战赛冠军方案,文章中提到了对匿名特征(n0-n14)进行暴力处理,以及构造组合交叉特征。因为我主要用的是catboost算法,它可以自行对指定的类别特征进行交叉,所以并没有进行处理。
3. 特征编码
由于catboost能够对指定的类别特征进行编码(包括one-hot方法、目标编码等),所以在这里并没有进行处理。如果想要使用xgboost算法则需要进行编码,编码方式可参照上述提到的天池安泰杯金融科技挑战赛冠军方案。
四、模型创建
这里主要参考博客阿里天池金融风控baseline
# ---- split the concatenated frame back into train / test ----
# Training rows are those with a known label; test rows have NaN isDefault.
train = data[~data['isDefault'].isnull()].copy()  # training data
target = train_label  # labels
test = data[data['isDefault'].isnull()].copy()  # test data
# Submission skeleton: id column plus a placeholder prediction.
sub=test[['id']].copy()
sub['isDefault']=0
# Drop the label, the id, and policyCode (constant column) from both sets.
train = train.drop(['isDefault','id','policyCode'],axis=1)  # training data
test=test.drop(['isDefault','id','policyCode'],axis=1)
print(train.shape)
print(test.shape)
# Categorical features for CatBoost to encode natively.
col=['grade','subGrade','employmentTitle','homeOwnership','verificationStatus','purpose','postCode','regionCode',
'initialListStatus','applicationType','title']
# Cast the categorical columns to integers (strings would also work) so
# CatBoost accepts them as cat_features.
for i in train.columns:
    if i in col:
        train[i] = train[i].astype('int64')
for i in test.columns:
    if i in col:
        test[i] = test[i].astype('int64')
# Build the model: AUC-evaluated binary classifier on CPU.
model=CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    learning_rate=0.1,
    iterations=1000,
    random_seed=2020,
    od_type="Iter",  # iteration-based overfitting detector
    depth=7)
# 10-fold stratified cross-validation; test predictions are averaged
# across the folds.
answers = []
mean_score = 0
n_folds = 10
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2021)
for train_, test_ in sk.split(train, train_label):
    x_train = train.iloc[train_]
    y_train = train_label.iloc[train_]
    x_test = train.iloc[test_]
    y_test = train_label.iloc[test_]
    # eval_set drives the overfitting detector; cat_features lists the
    # columns CatBoost should encode itself.
    clf = model.fit(x_train,y_train, eval_set=(x_test,y_test),verbose=500,cat_features=col)
    # Probability of the positive class for the held-out fold.
    yy_pred_valid=clf.predict(x_test,prediction_type='Probability')[:,-1]
    print('cat验证的auc:{}'.format(roc_auc_score(y_test, yy_pred_valid)))
    mean_score += roc_auc_score(y_test, yy_pred_valid) / n_folds
    # Predict the real test set with this fold's model and stash it.
    y_pred_valid = clf.predict(test,prediction_type='Probability')[:,-1]
    answers.append(y_pred_valid)
print('mean valAuc:{}'.format(mean_score))
# ---- write the averaged fold predictions as the submission ----
cat_pre=sum(answers)/n_folds
sub['isDefault']=cat_pre
sub.to_csv('金融预测.csv',index=False)
总结
- 没有进行异常值检测和特征选择,当特征数目过多时会影响评分速度
- catboost的参数没有进行细调(主要因为懒)
- 没有尝试模型融合,效果能达到0.742左右,感兴趣的小伙伴可以尝试一下模型融合看效果是否会提升。