【数据挖掘算法竞赛】山东省-公积金贷款逾期预测TOP8 baseline523
公众号: ChallengeHub
赛题链接
http://data.sd.gov.cn/cmpt/cmptDetail.html?id=26
赛题背景
维持和发展信用关系,是保护社会经济秩序的重要前提。随着金融市场的发展,信贷业务日益增多,金融机构迫切需要了解信贷主体的信息情况,对信贷资产的安全性、信贷主体的偿债能力给予科学评价,最大限度地防范贷款逾期风险。
赛题数据
评价指标
本次比赛成绩根据测试集上的预测结果进行排名。在公积金逾期风险监控中,需要做到尽可能少的误伤和尽可能准确的探测,于是我们选择“在FPR较低时的TPR加权平均值”作为评价指标。
给定一个阈值,可根据混淆矩阵计算TPR(覆盖率)和FPR(打扰率):
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
其中,TP、FN、FP、TN分别为真正例、假反例、假正例、真反例。
这里的评分指标,首先计算了3个覆盖率TPR:
TPR1:FPR=0.001时的TPR
TPR2:FPR=0.005时的TPR
TPR3:FPR=0.01时的TPR
最终成绩 = 0.4 * TPR1 + 0.3 * TPR2 + 0.3 * TPR3
代码如下:
def tpr_weight_funtion(y_true, y_predict):
    """Return the weighted TPR score used as the competition metric.

    Samples are ranked by predicted probability (highest first). Walking
    down that ranking gives a cumulative TPR and FPR at every row. For each
    target FPR (0.001, 0.005, 0.01) the TPR is read at the first row whose
    FPR is closest to the target, and the three values are combined as
    ``0.4 * TPR1 + 0.3 * TPR2 + 0.3 * TPR3``.

    Args:
        y_true: iterable of ground-truth labels; both 0 and 1 must occur.
        y_predict: iterable of predicted scores, same length as ``y_true``.

    Returns:
        The weighted score as a float.

    Raises:
        KeyError: if label 0 or label 1 does not occur in ``y_true``.
    """
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    # Highest score first, so row k is the k-th most suspicious sample.
    d = d.sort_values(['prob'], ascending=False)
    y = d.y
    class_counts = pd.Series(y).value_counts()
    PosAll = class_counts[1]
    NegAll = class_counts[0]
    # Positives seen so far (inclusive) at each rank.
    pCumsum = d['y'].cumsum()
    # Negatives seen so far: (rank + 1) samples minus the positives among them.
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll  # cumulative TPR
    nCumsumPer = nCumsum / NegAll  # cumulative FPR
    # idxmin returns the label of the first row with the smallest distance.
    TR1 = pCumsumPer[abs(nCumsumPer - 0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer - 0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer - 0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3
解决方案
读取数据
# Load the competition CSV files from the data directory one level up.
train = pd.read_csv('../公积金逾期预测-数据/train.csv')
test = pd.read_csv('../公积金逾期预测-数据/test.csv')
submit = pd.read_csv('../公积金逾期预测-数据/submit.csv')
# Notebook-style display of the three (rows, columns) shapes.
train.shape, test.shape, submit.shape
特征工程
一些业务特征
# Hand-crafted arithmetic features on the raw columns of `df`.
# NOTE(review): `df` is not built in this snippet; presumably it is the
# concatenation of train and test. Confirm against the full notebook.

# Raw columns that are reused several times below.
dkffe, dkye, dkll = df['DKFFE'], df['DKYE'], df['DKLL']
gryjce, dwyjce = df['GRYJCE'], df['DWYJCE']
grzhye = df['GRZHYE']

# Sums and DKLL-weighted products.
df['DKFFE_DKYE'] = dkffe + dkye
df['DKFFE_DKY_multi_DKLL'] = (dkffe + dkye) * dkll
df['DKFFE_multi_DKLL'] = dkffe * dkll
df['DKYE_multi_DKLL'] = dkye * dkll
df['GRYJCE_DWYJCE'] = gryjce + dwyjce
df['GRZHDNGJYE_GRZHSNJZYE'] = df['GRZHDNGJYE'] + df['GRZHSNJZYE']

# Share of each DK* component in the combined totals.
df['DKFFE_multi_DKLL_ratio'] = df['DKFFE_multi_DKLL'] / df['DKFFE_DKY_multi_DKLL']
df['DKYE_multi_DKLL_ratio'] = df['DKYE_multi_DKLL'] / df['DKFFE_DKY_multi_DKLL']
df['DKYE_DKFFE_ratio'] = dkye / df['DKFFE_DKYE']
df['DKFFE_DKYE_ratio'] = dkffe / df['DKFFE_DKYE']

# Differences between GRZHYE and the two GRZH* components.
df['GRZHYE_diff_GRZHDNGJYE'] = grzhye - df['GRZHDNGJYE']
df['GRZHYE_diff_GRZHSNJZYE'] = grzhye - df['GRZHSNJZYE']

# Share of each of GRYJCE / DWYJCE in their sum.
df['GRYJCE_DWYJCE_ratio'] = gryjce / df['GRYJCE_DWYJCE']
df['DWYJCE_GRYJCE_ratio'] = dwyjce / df['GRYJCE_DWYJCE']
分箱特征
由于本题目的是预测公积金贷款是否会逾期,所以可以针对不同的用户画像和贷款金额等做出分箱特征,后面还可以做分箱之后的交叉等,这里只展示了age、贷款余额的。
def get_age(df, col='age', thresholds=(18, 25, 30, 35, 40, 45)):
    """Add binary "greater than threshold" indicator columns for ``col``.

    For the i-th threshold (1-based) a column ``f"{col}_genFeat{i}"`` is
    written into ``df`` holding 1 where ``df[col] > threshold`` and 0
    otherwise.

    Args:
        df: DataFrame containing ``col``; it is modified in place.
        col: name of the source column. The values are read from this
            column (the earlier version always read ``df['age']`` and used
            ``col`` only for naming the outputs).
        thresholds: cut points, in output order.

    Returns:
        A tuple ``(df, feature_names)`` with the same DataFrame and the list
        of generated column names.
    """
    feat_names = []
    for i, threshold in enumerate(thresholds, start=1):
        name = f'{col}_genFeat{i}'
        df[name] = (df[col] > threshold).astype(int)
        feat_names.append(name)
    return df, feat_names

# 1609430399 is the Unix time for 2020-12-31 23:59:59 (UTC+8); the difference
# to CSNY is converted from seconds to whole 365-day years.
# NOTE(review): presumably CSNY is a birth date stored as a Unix timestamp in
# seconds. Confirm against the data dictionary.
df['age'] = ((1609430399 - df['CSNY']) / (365 * 24 * 3600)).astype(int)
df, genFeats1 = get_age(df, col='age')

# Distribution of the positive ages only.
# NOTE: seaborn deprecated `distplot` in 0.11 (use `histplot`/`displot`).
sns.distplot(df['age'][df['age'] > 0])
def get_daikuanYE(df, col,
                  thresholds=(100000, 120000, 140000, 180000,
                              220000, 260000, 300000)):
    """Add binary "greater than threshold" indicator columns for ``col``.

    For the i-th threshold (1-based) a column ``f"{col}_genFeat{i}"`` is
    written into ``df`` holding 1 where ``df[col] > threshold`` and 0
    otherwise.

    Args:
        df: DataFrame containing ``col``; it is modified in place.
        col: name of the amount column to bin.
        thresholds: cut points, in output order.

    Returns:
        A tuple ``(df, feature_names)`` with the same DataFrame and the list
        of generated column names.
    """
    feat_names = []
    for i, threshold in enumerate(thresholds, start=1):
        name = f'{col}_genFeat{i}'
        df[name] = (df[col] > threshold).astype(int)
        feat_names.append(name)
    return df, feat_names

# Threshold indicators for the two loan amount columns.
df, genFeats2 = get_daikuanYE(df, col='DKYE')
df, genFeats3 = get_daikuanYE(df, col='DKFFE')

# Side-by-side distributions of both columns, restricted to label == 1 rows.
# NOTE: seaborn deprecated `distplot` in 0.11 (use `histplot`/`displot`).
plt.figure(figsize=(8, 2))
plt.subplot(1, 2, 1)
sns.distplot(df['DKYE'][df['label'] == 1])
plt.subplot(1, 2, 2)
sns.distplot(df['DKFFE'][df['label'] == 1])
类别特征count、count ratio、onehot编码等
# Per-column encodings for every categorical feature.
for f in tqdm(cate_cols):
    # Replace each distinct value by an integer code, in order of appearance.
    df[f] = df[f].map(dict(zip(df[f].unique(), range(df[f].nunique()))))
    # Frequency of each code over the whole frame.
    df[f + '_count'] = df[f].map(df[f].value_counts())
    # One-hot columns named "<f>_<code>".
    df = pd.concat([df, pd.get_dummies(df[f], prefix=f"{f}")], axis=1)

# All unordered pairs of categorical columns.
cate_cols_combine = [[cate_cols[i], cate_cols[j]]
                     for i in range(len(cate_cols))
                     for j in range(i + 1, len(cate_cols))]

# Pair co-occurrence count, and its share within each single column's count.
for f1, f2 in tqdm(cate_cols_combine):
    pair_count = '{}_{}_count'.format(f1, f2)
    df[pair_count] = df.groupby([f1, f2])['id'].transform('count')
    df['{}_in_{}_prop'.format(f1, f2)] = df[pair_count] / df[f2 + '_count']
    df['{}_in_{}_prop'.format(f2, f1)] = df[pair_count] / df[f1 + '_count']
类别特征与数值特征交叉
# Group statistics of numeric features within each categorical value.
for f1 in tqdm(cate_cols):
    g = df.groupby(f1)
    for f2 in num_cols + gen_feats:
        # 'std' was listed twice before; once is enough.
        for stat in ['sum', 'mean', 'std', 'max', 'min']:
            df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
    # The body of this loop previously used f2 instead of f3, so it only
    # recomputed the last f2 columns and never produced the f3 features.
    for f3 in genFeats2 + genFeats3:
        for stat in ['sum', 'mean']:
            df['{}_{}_{}'.format(f1, f3, stat)] = g[f3].transform(stat)

# Group statistics of every numeric feature within each value of every other
# numeric feature.
num_cols_gen_feats = num_cols + gen_feats
for f1 in tqdm(num_cols_gen_feats):
    g = df.groupby(f1)
    for f2 in num_cols_gen_feats:
        if f1 == f2:
            continue
        for stat in ['sum', 'mean', 'std', 'max', 'min']:
            df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
删除取值个数只有一个(或全为空)的特征
# Columns with at most one distinct non-null value in the training rows.
# NOTE(review): this reads `train_df`, so it must run after the train/test
# split that creates it.
drop_feats = [f for f in train_df.columns if train_df[f].nunique() <= 1]
len(drop_feats), drop_feats
生成训练集、测试集
# Rows with a label form the training set; rows without one form the test set.
has_label = df['label'].notna()
train_df = df[has_label].reset_index(drop=True)
test_df = df[~has_label].reset_index(drop=True)
display(train_df.shape, test_df.shape)
LGB Model
这里只用了LGB模型,可以考虑跑多个模型然后集成
# Out-of-fold predictions for the training rows (overwritten for every seed).
oof = np.zeros(train_df.shape[0])
# feat_imp_df = pd.DataFrame({'feat': cols, 'imp': 0})
# Accumulator for the test prediction averaged over all folds and seeds.
test_df['prob'] = 0.0
clf = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=10230,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=1023,
    metric=None
)

val_aucs = []
seeds = [1023, 2048, 2098]
for seed in seeds:
    # The seed only changes how the folds are drawn; the model's own
    # random_state stays fixed.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        print('--------------------- {} fold ---------------------'.format(i))
        t = time.time()
        trn_x, trn_y = train_df[cols].iloc[trn_idx].reset_index(drop=True), train_df['label'].values[trn_idx]
        val_x, val_y = train_df[cols].iloc[val_idx].reset_index(drop=True), train_df['label'].values[val_idx]
        # NOTE(review): `early_stopping_rounds` / `verbose` were removed from
        # fit() in LightGBM 4.0; newer versions need
        # callbacks=[early_stopping(200), log_evaluation(200)] instead.
        clf.fit(
            trn_x, trn_y,
            eval_set=[(val_x, val_y)],
            eval_metric='auc',
            early_stopping_rounds=200,
            verbose=200
        )
        oof[val_idx] = clf.predict_proba(val_x)[:, 1]
        # Every fold of every seed contributes an equal share to the average.
        test_df['prob'] += clf.predict_proba(test_df[cols])[:, 1] / skf.n_splits / len(seeds)

    # AUC over the complete out-of-fold vector of this seed.
    cv_auc = roc_auc_score(train_df['label'], oof)
    val_aucs.append(cv_auc)
    print('\ncv_auc: ', cv_auc)
print(val_aucs, np.mean(val_aucs))
TPR指标验证
# Per-seed CV AUCs and their mean.
print(val_aucs, np.mean(val_aucs))
def tpr_weight_funtion(y_true, y_predict):
    """Return the weighted TPR score used as the competition metric.

    Samples are ranked by predicted probability (highest first). Walking
    down that ranking gives a cumulative TPR and FPR at every row. For each
    target FPR (0.001, 0.005, 0.01) the TPR is read at the first row whose
    FPR is closest to the target, and the three values are combined as
    ``0.4 * TPR1 + 0.3 * TPR2 + 0.3 * TPR3``.

    Args:
        y_true: iterable of ground-truth labels; both 0 and 1 must occur.
        y_predict: iterable of predicted scores, same length as ``y_true``.

    Returns:
        The weighted score as a float.

    Raises:
        KeyError: if label 0 or label 1 does not occur in ``y_true``.
    """
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    # Highest score first, so row k is the k-th most suspicious sample.
    d = d.sort_values(['prob'], ascending=False)
    y = d.y
    class_counts = pd.Series(y).value_counts()
    PosAll = class_counts[1]
    NegAll = class_counts[0]
    # Positives seen so far (inclusive) at each rank.
    pCumsum = d['y'].cumsum()
    # Negatives seen so far: (rank + 1) samples minus the positives among them.
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll  # cumulative TPR
    nCumsumPer = nCumsum / NegAll  # cumulative FPR
    # idxmin returns the label of the first row with the smallest distance.
    TR1 = pCumsumPer[abs(nCumsumPer - 0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer - 0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer - 0.01).idxmin()]

    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

# Competition metric on the out-of-fold predictions, next to the mean CV AUC.
tpr = round(tpr_weight_funtion(train_df['label'], oof), 6)
tpr, round(np.mean(val_aucs), 5)
生成结果文件
# Fill the submission template with the averaged test predictions.
submit['id'] = test_df['id']
submit['label'] = test_df['prob']

# The file name records the offline TPR and the mean CV AUC.
# NOTE(review): the '../sub' directory is not created here and must exist.
submit.to_csv('../sub/submission{}_{}.csv'.format(tpr, round(np.mean(val_aucs), 6)), index=False)
submit.head()
写在最后
线下AUC约0.940、TPR约0.491,线上TPR 0.523+,开源的时候位于top8。
提分点在于:由于是公积金逾期预测,因此贷款系列的特征能够很好表征用户行为,可以挖掘更多关于贷款的业务特征;另一个点在于评价指标是tpr指标,可以考虑去优化tpr指标。
完整代码链接,关注公众号ChallengeHub回复"山东公积金"获取代码链接.
欢迎扫码关注ChallengeHub公众号
欢迎加入ChallengeHub学习交流群