The competition
Competition page: https://www.dcjingsai.com/v2/cmptDetail.html?id=319
Many thanks to the authors who open-sourced their solutions:
TOP1 solution: 2019厦门国际银行“数创金融杯”数据建模大赛-top1
TOP2 solution: 2019厦门国际银行“数创金融杯”数据建模大赛-top2
This post simply organizes and studies the TOP solutions.
Data overview: the data is split into a training set and a test set across three files, train_x.csv, train_target.csv and test_x.csv, where train_x.csv holds the training-set features and train_target.csv holds the training-set target variable.
Target: whether fraudulent behavior occurred.
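The snippets below assume DataFrames named train, test (and later data / data_train_test_all) already exist. A minimal, assumed loading step might look like this; the merge key id and the -1 fill for the test target are my assumptions, chosen so that the later data['target'] >= 0 / data['target'] < 0 split works:

import pandas as pd

# Hypothetical loading step; file names follow the overview above.
train_x = pd.read_csv('train_x.csv')
train_target = pd.read_csv('train_target.csv')
test = pd.read_csv('test_x.csv')

# 'id' is assumed to be the join key; it is also used as a key column later on.
train = train_x.merge(train_target, on='id', how='left')

# Fill the unknown test target with -1 so that the later
# data['target'] >= 0 / data['target'] < 0 split works.
test['target'] = -1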
The most distinctive trait of the TOP solutions is group-wise count encoding and target encoding.
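As a reminder of what these two encodings mean, here is a minimal sketch; the toy column names cat_col and target are purely illustrative, not taken from the winning code:

import pandas as pd

# Toy data purely for illustration; the real features come from train_x.csv.
df = pd.DataFrame({
    "cat_col": ["a", "a", "b", "b", "b", "c"],
    "target":  [0, 1, 0, 0, 1, 1],
})

# Count encoding: how many rows share each category value.
df["cat_col_count"] = df.groupby("cat_col")["cat_col"].transform("count")

# Target encoding: mean of the target within each category
# (in practice computed on training rows only / out-of-fold to limit leakage).
df["cat_col_target_mean"] = df.groupby("cat_col")["target"].transform("mean")

print(df)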
Highlight 1
For the strong feature lmt, compare its distribution in the training set and the test set:
train["oringin"]="train"
test["oringin"]="test"
data=pd.concat([train,test],axis=0,ignore_index=True)
#View data
# Explore feature distibution
#fig = plt.figure(figsize=(6, 6))
# for column in data.columns[0:-1]:
for column in ['lmt']:
g = sns.kdeplot(data[column][(data["oringin"] == "train")], color="Red", shade = True)
g = sns.kdeplot(data[column][(data["oringin"] == "test")], ax =g, color="Green", shade= True)
g.set_xlabel(column)
g.set_ylabel("Frequency")
g = g.legend(["train","test"])
plt.show()
Between roughly 5 and 6.5, the training-set lmt shows peaks that do not appear in the test set. Considering the positive/negative class ratio, part of the negative samples in the corresponding lmt ranges is dropped so that the lmt distributions stay consistent:
train1 = train.loc[train.target == 0]   # negative samples
train2 = train.loc[train.target == 1]   # positive samples

a = train1['lmt'].between(5, 5.2)
train1 = train1[~a]                     # drop negative samples with lmt in [5, 5.2]
a = train1['lmt'].between(6, 6.2)
train1 = train1[~a]                     # drop negative samples with lmt in [6, 6.2]

train = pd.concat([train1, train2], axis=0, ignore_index=True)
Because lmt is such an important feature, it also gets extra processing:
for i in ['certId', 'bankCard']:
    # data['lmt_mean' + i] = data.groupby(i)['lmt'].transform('count')
    data['lmt_nunique' + i] = data.groupby(i)['lmt'].transform('nunique')  # number of distinct lmt values per certId / bankCard group
Highlight 2
Construct count and rank features from missing and non-missing values.
Non-missing values:
count_fea = ['certId', 'dist', 'job', 'lmt', 'bankCard', 'residentAddr', 'setupHour', 'weekday', 'ethnic']
for i in count_fea:
    data[i + '_count'] = data.groupby(i)['id'].transform('count')
    data[i + '_rank'] = data.groupby(i)['id'].transform('rank')
Missing values:
# Missing-value statistics: for features that contain missing values (coded as -999),
# build missing-value count features
loss_fea = ['bankCard', 'residentAddr', 'highestEdu', 'linkRela']
for i in loss_fea:
    a = data.loc[data[i] == -999]
    e = a.groupby(['certId'])['id'].count().reset_index(name=i + '_certId_count')
    data = data.merge(e, on='certId', how='left')
    d = a.groupby(['loanProduct'])['id'].count().reset_index(name=i + '_loan_count')
    data = data.merge(d, on='loanProduct', how='left')
    m = a.groupby(['job'])['id'].count().reset_index(name=i + '_job_count')
    data = data.merge(m, on='job', how='left')
    # ratio of the missing-value counts to the overall group counts built above
    data['certloss_' + i] = data[i + '_certId_count'] / data['certId_count']
    data['jobloss_' + i] = data[i + '_job_count'] / data['job_count']
Highlight 3
Region feature processing.
dist (region), certId (first six digits of the ID-card number) and residentAddr (residential address) are strongly related in meaning, so they are processed according to that meaning:
data['distance'] = abs(data['certId'] - data['dist'])           # gap between the loan region and the household-registration region
data['distance2'] = abs(data['certId'] - data['residentAddr'])  # gap between the residential address and the household registration
Highlight 4
Anonymous feature processing.
(1) Feature selection: see the original solution code (a purely hypothetical sketch is given after the code block below).
(2) Build features around lmt and the selected feature list feas_x (grouped counts and grouped lmt statistics):
from tqdm import tqdm

# Using the selected anonymous features, build grouped counts and lmt statistics
# around a few notable columns
for i in tqdm(feas_x):
    data_train_test_all[i + 'wk_cnt'] = data_train_test_all.groupby(['weekday', i])[i].transform('count')
    data_train_test_all[i + 'hour_cnt'] = data_train_test_all.groupby(['setupHour', i])[i].transform('count')
    # data_train_test_all[i + 'basicLevel_cnt'] = data_train_test_all.groupby(['basicLevel', i])[i].transform('count')
    # data_train_test_all[i + 'job_cnt'] = data_train_test_all.groupby(['job', i])[i].transform('count')
    data_train_test_all[i + 'lmt_mean'] = data_train_test_all.groupby([i])['lmt'].transform('mean')
    data_train_test_all[i + 'lmt_max'] = data_train_test_all.groupby([i])['lmt'].transform('max')
    data_train_test_all[i + 'lmt_min'] = data_train_test_all.groupby([i])['lmt'].transform('min')
    data_train_test_all[i + 'lmt_std'] = data_train_test_all.groupby([i])['lmt'].transform('std')
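The feature-selection code for step (1) is not reproduced in this post; the linked repositories contain the actual logic. As a purely hypothetical sketch (the x_ column prefix and the 0.99 dominance threshold are my assumptions, not the winners' rule), one simple way to obtain a list like feas_x is to drop near-constant anonymous columns:

# Hypothetical sketch of step (1): keep anonymous columns that are not almost constant.
# The 'x_' prefix and the 0.99 threshold are assumptions for illustration only.
anon_cols = [c for c in data_train_test_all.columns if c.startswith('x_')]

feas_x = []
for c in anon_cols:
    top_ratio = data_train_test_all[c].value_counts(normalize=True).iloc[0]
    if top_ratio < 0.99:  # keep columns where no single value dominates
        feas_x.append(c)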
Highlight 5
All kinds of counts, target-based counts, ratios between the two, and group-wise nunique features (the +3 in the denominators below presumably acts as light smoothing):
cat_features_1 = ['dt_year', 'dt_month', 'job',
                  'basicLevel', 'ethnic', 'highestEdu',
                  'dist', 'gender', 'age', 'loanProduct',
                  'lmt', 'bankCard', 'residentAddr',
                  'linkRela', 'setupHour', 'weekday']
for i in tqdm(cat_features_1):
    # overall count of each value, plus count/sum of the target computed on train1
    data_train_test_all[i + '_count1'] = data_train_test_all.groupby([i])[i].transform('count')
    tmp = train1.groupby(i, as_index=False)['target'].agg(**{i + '_count2': 'count', i + '_sum2': 'sum'})
    data_train_test_all = data_train_test_all.merge(tmp, how='left')
    # ratio of the overall count to the (smoothed) target count
    data_train_test_all[i + '_count2'] = data_train_test_all[i + '_count1'] / (data_train_test_all[i + '_count2'] + 3)

cat_features_2 = ['dt_year', 'dt_month', 'job',
                  'basicLevel', 'ethnic', 'highestEdu',
                  'dist', 'gender', 'age', 'loanProduct',
                  'lmt', 'bankCard', 'residentAddr',
                  'linkRela', 'setupHour']
for i in tqdm(cat_features_2):
    data_train_test_all[i + 'weekday_count'] = data_train_test_all.groupby(['weekday', i])[i].transform('count')

cat_features_3 = ['dt_year', 'dt_month', 'job',
                  'basicLevel', 'ethnic', 'highestEdu',
                  'dist', 'gender', 'age',
                  'lmt', 'bankCard', 'residentAddr',
                  'linkRela', 'setupHour', 'weekday']
for i in tqdm(cat_features_3):
    data_train_test_all[i + 'loanProduct_count'] = data_train_test_all.groupby(['loanProduct', i])[i].transform('count')
    tmp = train1.groupby(['loanProduct', i], as_index=False)['target'].agg(**{i + 'loanProduct_count1': 'count', i + 'loanProduct_mn1': 'mean'})
    data_train_test_all = data_train_test_all.merge(tmp, how='left')
    data_train_test_all[i + 'loanProduct_count1'] = data_train_test_all[i + 'loanProduct_count'] / (data_train_test_all[i + 'loanProduct_count1'] + 3)
    del data_train_test_all[i + 'loanProduct_count']

cat_features_4 = ['dt_year', 'dt_month', 'job',
                  'basicLevel', 'ethnic', 'highestEdu',
                  'dist', 'gender', 'age', 'loanProduct',
                  'lmt', 'bankCard',
                  'linkRela', 'setupHour', 'weekday']
for i in tqdm(cat_features_4):
    data_train_test_all[i + 'residentAddr_count'] = data_train_test_all.groupby(['residentAddr', i])[i].transform('count')
    tmp = train1.groupby(['residentAddr', i], as_index=False)['target'].agg(**{i + 'residentAddr_count1': 'count'})
    data_train_test_all = data_train_test_all.merge(tmp, how='left')
    data_train_test_all[i + 'residentAddr_count1'] = data_train_test_all[i + 'residentAddr_count'] / (data_train_test_all[i + 'residentAddr_count1'] + 3)
    del data_train_test_all[i + 'residentAddr_count']

# group-wise nunique for each pair of the columns below
unique_col = ['loanProduct', 'lmt', 'basicLevel', 'bankCard', 'residentAddr', 'linkRela', 'setupHour', 'weekday']
for i, col in enumerate(unique_col[:-1]):
    for j, col1 in enumerate(unique_col[i + 1:]):
        data_train_test_all[col + col1 + '_nunique'] = data_train_test_all.groupby([col])[col1].transform('nunique')
        tmp = train1.groupby(col, as_index=False)[col1].agg(**{col + col1 + '_nunique1': 'nunique'})
        data_train_test_all = data_train_test_all.merge(tmp, how='left')
        data_train_test_all[col + col1 + '_nunique1'] = data_train_test_all[col + col1 + '_nunique'] / (data_train_test_all[col + col1 + '_nunique1'] + 3)
Highlight 6
Models
LightGBM
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# lgb
train_df = data[data['target'] >= 0]
test = data[data['target'] < 0]
features = [c for c in data.columns if c not in ['target', 'id']]
target = train_df['target']

param = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 2 ** 5 - 1,
    'min_child_samples': 50,
    'max_bin': 50,
    'min_child_weight': 0,
    'scale_pos_weight': 15,
    'feature_fraction_seed': 2019,
    'max_depth': 2,
    'nthread': 4,
    'verbose': 0,
    'lambda_l1': 1,
    'lambda_l2': 0
}

folds = StratifiedKFold(n_splits=10, shuffle=False)  # with shuffle=False the folds follow row order and no random_state is needed
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=1000, early_stopping_rounds=3000)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

# plot averaged feature importance
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:150].index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]
plt.figure(figsize=(14, 28))
sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('F:/26.png')

sub_df = pd.DataFrame({"id": test["id"].values})
sub_df["target"] = predictions.round(6)
sub_df.to_csv("F:/submission27.csv", index=False)
In addition there are two XGBoost models (a hypothetical sketch of one follows).
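The XGBoost code is not shown in the source post; here is a hedged sketch of what one of the two models could look like, mirroring the LightGBM CV loop above. All parameter values are placeholders, not the winning configuration.

import xgboost as xgb

# Reuses train_df, test, features, target and folds from the LightGBM block above.
# All parameter values here are placeholders, not the actual winning settings.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.02,
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 15,
}

xgb_oof = np.zeros(len(train_df))
xgb_pred = np.zeros(len(test))

for trn_idx, val_idx in folds.split(train_df.values, target.values):
    dtrn = xgb.DMatrix(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    dval = xgb.DMatrix(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    bst = xgb.train(xgb_params, dtrn, num_boost_round=10000,
                    evals=[(dtrn, 'train'), (dval, 'valid')],
                    early_stopping_rounds=200, verbose_eval=500)
    best_it = bst.best_iteration + 1
    # iteration_range requires xgboost >= 1.4
    xgb_oof[val_idx] = bst.predict(dval, iteration_range=(0, best_it))
    xgb_pred += bst.predict(xgb.DMatrix(test[features]), iteration_range=(0, best_it)) / folds.n_splits

print("XGB CV score: {:<8.5f}".format(roc_auc_score(target, xgb_oof)))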
The three models are then combined with a weighted average:
sub['target'] = lgb['target'] * 0.06 + xgb['target'] * 0.09 + xgb2['target'] * 0.85
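For context, the blending line above assumes the three per-model predictions are already loaded as DataFrames; a minimal sketch with hypothetical file names:

import pandas as pd

# Hypothetical file names; each file is a submission with columns id and target.
# (The DataFrame names lgb / xgb / xgb2 follow the blend line above and shadow
# the lightgbm / xgboost module aliases used earlier.)
lgb = pd.read_csv('F:/submission_lgb.csv')
xgb = pd.read_csv('F:/submission_xgb1.csv')
xgb2 = pd.read_csv('F:/submission_xgb2.csv')

sub = lgb[['id']].copy()
sub['target'] = lgb['target'] * 0.06 + xgb['target'] * 0.09 + xgb2['target'] * 0.85
sub.to_csv('F:/submission_blend.csv', index=False)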