import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor,LinearRegression,Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
# Load the two competition files from the current working directory.
train=pd.read_csv('train.csv')
testA=pd.read_csv('testA.csv')
# Preview the first rows of the training frame (notebook cell display).
train.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
# Names of the object-dtype columns in train, i.e. the ones that still need encoding.
list(train.select_dtypes('object'))
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
# Stack train and testA row-wise under a fresh 0..n-1 index so that the
# preprocessing below is applied to both sets at once.
data=pd.concat([train,testA],axis=0,ignore_index=True)
- concat函数:https://www.jianshu.com/p/421f040dfe2f
数据预处理
可以看到很多变量不能直接用于训练,比如 'grade'、'subGrade'、'employmentLength'、'issueDate'、'earliesCreditLine',需要先进行预处理。
# Show the distinct values of the two grade columns in sorted order.
print(sorted(data.grade.unique()))
print(sorted(data.subGrade.unique()))
['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
# Frequency of each raw employmentLength label, with NaN counted as its own bucket.
data['employmentLength'].value_counts(dropna=False).sort_index()
1 year 65671
10+ years 328525
2 years 90565
3 years 80163
4 years 59818
5 years 62645
6 years 46582
7 years 44230
8 years 45168
9 years 37866
< 1 year 80226
NaN 58541
Name: employmentLength, dtype: int64
- 首先将employmentLength转换为数值
# Rewrite the two non-numeric buckets so that every label starts with a number.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is a chained operation that does not update `data`
# when pandas Copy-on-Write is active.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
def employmentLength_to_int(s):
    """Convert an employment-length label into a whole number of years.

    Args:
        s: A label such as '3 years', '1 year', '10+ years' or '< 1 year',
            a missing value, or a value that is already numeric.

    Returns:
        The missing value unchanged when ``s`` is null; otherwise an
        ``np.int8``. '< 1 year' maps to 0 and '10+ years' to 10, matching
        the '0 years' / '10 years' labels used elsewhere in this file.
    """
    if pd.isnull(s):
        return s
    if not isinstance(s, str):
        # Already converted (e.g. the cell was re-run): just normalise the type.
        return np.int8(s)
    text = s.strip()
    if text.startswith('<'):
        # '< 1 year' has no leading number; treat it as zero full years.
        return np.int8(0)
    # The first token is the year count, possibly with a trailing '+'.
    return np.int8(int(text.split()[0].rstrip('+')))
# Convert every label element-wise; null entries are passed through unchanged.
data['employmentLength']=data['employmentLength'].apply(employmentLength_to_int)
- https://blog.csdn.net/weixin_41712499/article/details/82719987
- https://blog.csdn.net/kai123wen/article/details/99321824
data['employmentLength'].value_counts(dropna=False).sort_index()
# dropna=False keeps NaN in the counts instead of dropping it.
0.0 80226
1.0 65671
2.0 90565
3.0 80163
4.0 59818
5.0 62645
6.0 46582
7.0 44230
8.0 45168
9.0 37866
10.0 328525
NaN 58541
Name: employmentLength, dtype: int64
- 对earliesCreditLine进行预处理
# Look at five randomly sampled raw values to see the string format.
data.earliesCreditLine.sample(5)
618907 Nov-2004
145773 Oct-2001
21633 Mar-2005
697120 Sep-1990
815318 Feb-2004
Name: earliesCreditLine, dtype: object
# Keep only the year: the last four characters of each value, parsed as an int.
data['earliesCreditLine'] = data['earliesCreditLine'].map(lambda text: int(text[-4:]))
# Summary statistics of the extracted years (notebook cell display).
data['earliesCreditLine'].describe()
count 1000000.000000
mean 1998.688632
std 7.606231
min 1944.000000
25% 1995.000000
50% 2000.000000
75% 2004.000000
max 2015.000000
Name: earliesCreditLine, dtype: float64
# Print column dtypes and non-null counts to check what is left to encode.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 47 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 1000000 non-null int64
1 loanAmnt 1000000 non-null float64
2 term 1000000 non-null int64
3 interestRate 1000000 non-null float64
4 installment 1000000 non-null float64
5 grade 1000000 non-null object
6 subGrade 1000000 non-null object
7 employmentTitle 999999 non-null float64
8 employmentLength 941459 non-null float64
9 homeOwnership 1000000 non-null int64
10 annualIncome 1000000 non-null float64
11 verificationStatus 1000000 non-null int64
12 issueDate 1000000 non-null object
13 isDefault 800000 non-null float64
14 purpose 1000000 non-null int64
15 postCode 999999 non-null float64
16 regionCode 1000000 non-null int64
17 dti 999700 non-null float64
18 delinquency_2years 1000000 non-null float64
19 ficoRangeLow 1000000 non-null float64
20 ficoRangeHigh 1000000 non-null float64
21 openAcc 1000000 non-null float64
22 pubRec 1000000 non-null float64
23 pubRecBankruptcies 999479 non-null float64
24 revolBal 1000000 non-null float64
25 revolUtil 999342 non-null float64
26 totalAcc 1000000 non-null float64
27 initialListStatus 1000000 non-null int64
28 applicationType 1000000 non-null int64
29 earliesCreditLine 1000000 non-null int64
30 title 999999 non-null float64
31 policyCode 1000000 non-null float64
32 n0 949619 non-null float64
33 n1 949619 non-null float64
34 n2 949619 non-null float64
35 n3 949619 non-null float64
36 n4 958367 non-null float64
37 n5 949619 non-null float64
38 n6 949619 non-null float64
39 n7 949619 non-null float64
40 n8 949618 non-null float64
41 n9 949619 non-null float64
42 n10 958367 non-null float64
43 n11 912673 non-null float64
44 n12 949619 non-null float64
45 n13 949619 non-null float64
46 n14 949619 non-null float64
dtypes: float64(35), int64(9), object(3)
memory usage: 358.6+ MB
- 类别特征处理
# A subset of the categorical features.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
# Print the number of distinct (non-null) values of each of those columns.
for f, n_levels in data[cate_features].nunique().items():
    print(f, '类型数:', n_levels)
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 298101
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 935
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 47903
policyCode 类型数: 1
# One-hot encode the categorical columns that have more than two levels but are
# not high-dimensional/sparse; drop_first=True omits the first level of each.
data=pd.get_dummies(data,columns=['grade','subGrade','homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
# High-cardinality categorical columns are replaced by two numeric encodings.
for f in ('employmentTitle', 'postCode', 'title'):
    ids_per_level = data.groupby([f])['id']
    # Number of rows that share this row's category value.
    data[f + '_cnts'] = ids_per_level.transform('count')
    # Descending rank of the row's id among the rows of the same category.
    # NOTE(review): rows whose category is NaN fall outside every group; confirm
    # that the int cast below copes with them on the pandas version in use.
    data[f + '_rank'] = ids_per_level.rank(ascending=False).astype(int)
    # The raw column is no longer needed once both encodings exist.
    del data[f]
训练数据/测试数据准备
# Model inputs are all columns except the row id, the raw issue date and the target.
excluded = {'id', 'issueDate', 'isDefault'}
features = [col for col in data.columns if col not in excluded]

# Rows with a target value form the training set; rows without one form the test set.
has_label = data['isDefault'].notna()
train = data.loc[has_label].reset_index(drop=True)
test = data.loc[~has_label].reset_index(drop=True)

x_train = train[features]
x_test = test[features]
y_train = train['isDefault']
- reset_index(drop=True):https://www.cnblogs.com/keye/p/11229863.html
- x_train, x_test, y_train,y_test的含义:https://blog.csdn.net/lichunxia516/article/details/107707336
模型训练
- 直接构建了一个函数,可以调用三种树模型,方便快捷
def _fit_lgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one LightGBM fold; return (validation predictions, test predictions)."""
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': 2020,
        # Only n_jobs is kept: the logged run shows LightGBM overriding
        # 'nthread' with it and rejecting 'silent' as an unknown parameter.
        'n_jobs': 24,
        'verbose': -1,
    }
    # NOTE(review): verbose_eval / early_stopping_rounds are the keyword form
    # this notebook was run with; newer LightGBM releases expect callbacks
    # instead - confirm against the installed version.
    model = clf.train(params, train_matrix, 50000,
                      valid_sets=[train_matrix, valid_matrix],
                      verbose_eval=200, early_stopping_rounds=200)
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    return val_pred, test_pred


def _fit_xgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one XGBoost fold; return (validation predictions, test predictions)."""
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)
    test_matrix = clf.DMatrix(test_x)
    params = {
        'booster': 'gbtree',
        # Was misspelled 'binary:logietic', which is not a valid objective.
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'nthread': 36,
        'silent': True,
    }
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    model = clf.train(params, train_matrix, num_boost_round=50000,
                      evals=watchlist, verbose_eval=200,
                      early_stopping_rounds=200)
    # NOTE(review): ntree_limit / best_ntree_limit may not exist on newer
    # XGBoost releases - confirm against the installed version.
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    return val_pred, test_pred


def _fit_cat(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one CatBoost fold; return (validation predictions, test predictions)."""
    params = {
        'learning_rate': 0.05,
        'depth': 5,
        'l2_leaf_reg': 10,
        'bootstrap_type': 'Bernoulli',
        'od_type': 'Iter',
        'od_wait': 50,
        'random_seed': 11,
        'allow_writing_files': False,
    }
    model = clf(iterations=20000, **params)
    model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
              cat_features=[], use_best_model=True, verbose=500)
    return model.predict(val_x), model.predict(test_x)


def cv_model(clf,train_x,train_y,test_x,clf_name):
    """Run 5-fold cross-validation with one of three boosted-tree libraries.

    Args:
        clf: The library handle to train with: the lightgbm module for
            'lgb', the xgboost module for 'xgb', or a CatBoost estimator
            class for 'cat'.
        train_x: Training features; must support ``.iloc`` and ``.shape``.
        train_y: Training labels, aligned positionally with ``train_x``.
        test_x: Test features; must support ``.shape``.
        clf_name: One of 'lgb', 'xgb' or 'cat'.

    Returns:
        A pair ``(train, test)``: the out-of-fold predictions for every
        training row, and the test predictions averaged over the folds.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    fit_fold_by_name = {'lgb': _fit_lgb, 'xgb': _fit_xgb, 'cat': _fit_cat}
    if clf_name not in fit_fold_by_name:
        raise ValueError(
            "clf_name must be one of 'lgb', 'xgb', 'cat'; got %r" % (clf_name,))
    fit_fold = fit_fold_by_name[clf_name]

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positions, so select labels positionally from a plain array
    # instead of relying on train_y having a 0..n-1 index.
    labels = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('******{}******'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        val_pred, test_pred = fit_fold(clf, trn_x, trn_y, val_x, val_y, test_x)

        oof_pred[valid_index] = val_pred
        # Accumulate (previously this overwrote the result on every fold, so
        # only the last fold's prediction, divided by the fold count, survived).
        test_pred_mean += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print('%s_score_list:' % clf_name, cv_scores)
    print('%s_score_mean:' % clf_name, np.mean(cv_scores))
    print('%s_score_std:' % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean
K折交叉验证:
- https://blog.csdn.net/weixin_39183369/article/details/78953653
- https://blog.csdn.net/qq_41076797/article/details/101123197
调参:
- https://blog.csdn.net/u012735708/article/details/83749703
- https://blog.csdn.net/iyuanshuo/article/details/80142730
def lgb_model(x_train,y_train,x_test):
    """Cross-validate LightGBM through cv_model and return its (train, test) pair."""
    oof_pred, test_pred = cv_model(lgb, x_train, y_train, x_test, 'lgb')
    return oof_pred, test_pred
def xgb_model(x_train,y_train,x_test):
    """Cross-validate XGBoost through cv_model and return its (train, test) pair."""
    oof_pred, test_pred = cv_model(xgb, x_train, y_train, x_test, 'xgb')
    return oof_pred, test_pred
def cat_model(x_train,y_train,x_test):
    """Cross-validate CatBoostRegressor through cv_model and return its (train, test) pair."""
    oof_pred, test_pred = cv_model(
        CatBoostRegressor, x_train, y_train, x_test, 'cat')
    return oof_pred, test_pred
# Run the LightGBM cross-validation and keep both arrays it returns.
lgb_train,lgb_test=lgb_model(x_train,y_train,x_test)
******1******
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.742898 valid_1's auc: 0.730406
[400] training's auc: 0.755553 valid_1's auc: 0.731185
[600] training's auc: 0.766567 valid_1's auc: 0.731421
[800] training's auc: 0.77656 valid_1's auc: 0.731297
Early stopping, best iteration is:
[658] training's auc: 0.769561 valid_1's auc: 0.731571
[0.7315707699391983]
******2******
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.743889 valid_1's auc: 0.726598
[400] training's auc: 0.756346 valid_1's auc: 0.727829
[600] training's auc: 0.767237 valid_1's auc: 0.728122
[800] training's auc: 0.777257 valid_1's auc: 0.728164
Early stopping, best iteration is:
[700] training's auc: 0.772432 valid_1's auc: 0.728318
[0.7315707699391983, 0.7283181812019169]
******3******
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.743204 valid_1's auc: 0.731376
[400] training's auc: 0.7554 valid_1's auc: 0.732444
[600] training's auc: 0.766372 valid_1's auc: 0.732822
[800] training's auc: 0.776228 valid_1's auc: 0.732611
Early stopping, best iteration is:
[620] training's auc: 0.767377 valid_1's auc: 0.732834
[0.7315707699391983, 0.7283181812019169, 0.732833858510838]
******4******
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.742844 valid_1's auc: 0.730001
[400] training's auc: 0.755185 valid_1's auc: 0.731181
[600] training's auc: 0.766741 valid_1's auc: 0.731697
[800] training's auc: 0.776848 valid_1's auc: 0.731685
Early stopping, best iteration is:
[722] training's auc: 0.773097 valid_1's auc: 0.731733
[0.7315707699391983, 0.7283181812019169, 0.732833858510838, 0.7317333003550207]
******5******
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.743219 valid_1's auc: 0.729179
[400] training's auc: 0.755904 valid_1's auc: 0.730599
[600] training's auc: 0.766513 valid_1's auc: 0.731059
[800] training's auc: 0.776506 valid_1's auc: 0.730971
Early stopping, best iteration is:
[735] training's auc: 0.773511 valid_1's auc: 0.731143
[0.7315707699391983, 0.7283181812019169, 0.732833858510838, 0.7317333003550207, 0.7311427854544066]
lgb_scotrainre_list: [0.7315707699391983, 0.7283181812019169, 0.732833858510838, 0.7317333003550207, 0.7311427854544066]
lgb_score_mean: 0.7311197790922761
lgb_score_std: 0.001507802995682687
# Reportedly very slow to train, so the XGBoost run is skipped here.
# NOTE: with this line commented out, xgb_train / xgb_test are never defined.
#xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
cat_train,cat_test=cat_model(x_train,y_train,x_test)
******1******
0: learn: 0.3985252 test: 0.3966187 best: 0.3966187 (0) total: 178ms remaining: 59m 12s
500: learn: 0.3771946 test: 0.3759285 best: 0.3759285 (500) total: 30.8s remaining: 19m 59s
1000: learn: 0.3756449 test: 0.3751634 best: 0.3751634 (1000) total: 1m 4s remaining: 20m 32s
1500: learn: 0.3745709 test: 0.3748276 best: 0.3748276 (1500) total: 1m 39s remaining: 20m 28s
2000: learn: 0.3736588 test: 0.3746263 best: 0.3746258 (1998) total: 2m 14s remaining: 20m 10s
2500: learn: 0.3728292 test: 0.3744849 best: 0.3744849 (2500) total: 2m 48s remaining: 19m 41s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3744018679
bestIteration = 2905
Shrink model to first 2906 iterations.
[0.7327200609336475]
******2******
0: learn: 0.3979537 test: 0.3988945 best: 0.3988945 (0) total: 126ms remaining: 42m 3s
500: learn: 0.3764995 test: 0.3787237 best: 0.3787237 (500) total: 41.2s remaining: 26m 42s
1000: learn: 0.3749374 test: 0.3779174 best: 0.3779174 (1000) total: 1m 19s remaining: 24m 59s
1500: learn: 0.3738552 test: 0.3775812 best: 0.3775812 (1500) total: 1m 54s remaining: 23m 34s
2000: learn: 0.3729340 test: 0.3773443 best: 0.3773436 (1998) total: 2m 31s remaining: 22m 46s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3773239679
bestIteration = 2052
Shrink model to first 2053 iterations.
[0.7327200609336475, 0.7282917118426803]
******3******
0: learn: 0.3980280 test: 0.3987527 best: 0.3987527 (0) total: 153ms remaining: 51m 8s
500: learn: 0.3767797 test: 0.3776461 best: 0.3776461 (500) total: 37.8s remaining: 24m 32s
1000: learn: 0.3752307 test: 0.3768433 best: 0.3768433 (1000) total: 1m 12s remaining: 22m 58s
1500: learn: 0.3741403 test: 0.3764607 best: 0.3764605 (1499) total: 1m 47s remaining: 22m 3s
2000: learn: 0.3732161 test: 0.3762495 best: 0.3762493 (1997) total: 2m 21s remaining: 21m 9s
2500: learn: 0.3723968 test: 0.3761103 best: 0.3761098 (2495) total: 2m 55s remaining: 20m 27s
3000: learn: 0.3716474 test: 0.3760186 best: 0.3760186 (3000) total: 3m 29s remaining: 19m 45s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3759560948
bestIteration = 3410
Shrink model to first 3411 iterations.
[0.7327200609336475, 0.7282917118426803, 0.7338399687776773]
******4******
0: learn: 0.3980748 test: 0.3983970 best: 0.3983970 (0) total: 129ms remaining: 42m 51s
500: learn: 0.3767830 test: 0.3777709 best: 0.3777709 (500) total: 36.3s remaining: 23m 33s
1000: learn: 0.3752528 test: 0.3769020 best: 0.3769020 (1000) total: 1m 25s remaining: 26m 53s
1500: learn: 0.3741987 test: 0.3765448 best: 0.3765448 (1500) total: 2m 20s remaining: 28m 54s
2000: learn: 0.3732910 test: 0.3763156 best: 0.3763156 (2000) total: 3m 16s remaining: 29m 23s
2500: learn: 0.3724645 test: 0.3761445 best: 0.3761435 (2498) total: 4m 10s remaining: 29m 15s
3000: learn: 0.3716982 test: 0.3760409 best: 0.3760409 (3000) total: 5m 6s remaining: 28m 58s
3500: learn: 0.3709615 test: 0.3759851 best: 0.3759842 (3495) total: 6m 2s remaining: 28m 28s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3759786172
bestIteration = 3597
Shrink model to first 3598 iterations.
[0.7327200609336475, 0.7282917118426803, 0.7338399687776773, 0.7325672923232748]
******5******
0: learn: 0.3981448 test: 0.3980859 best: 0.3980859 (0) total: 144ms remaining: 47m 51s
500: learn: 0.3767559 test: 0.3775909 best: 0.3775909 (500) total: 58.4s remaining: 37m 54s
1000: learn: 0.3752239 test: 0.3768122 best: 0.3768122 (1000) total: 1m 51s remaining: 35m 7s
1500: learn: 0.3741592 test: 0.3764654 best: 0.3764654 (1500) total: 2m 21s remaining: 28m 59s
2000: learn: 0.3732513 test: 0.3762308 best: 0.3762294 (1997) total: 2m 58s remaining: 26m 42s
2500: learn: 0.3724325 test: 0.3760785 best: 0.3760785 (2500) total: 3m 37s remaining: 25m 23s
3000: learn: 0.3716690 test: 0.3759789 best: 0.3759789 (3000) total: 4m 15s remaining: 24m 8s
3500: learn: 0.3709385 test: 0.3759029 best: 0.3759014 (3491) total: 4m 55s remaining: 23m 13s
4000: learn: 0.3702519 test: 0.3758301 best: 0.3758288 (3970) total: 5m 35s remaining: 22m 20s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3758115042
bestIteration = 4164
Shrink model to first 4165 iterations.
[0.7327200609336475, 0.7282917118426803, 0.7338399687776773, 0.7325672923232748, 0.7317952826099017]
cat_scotrainre_list: [0.7327200609336475, 0.7282917118426803, 0.7338399687776773, 0.7325672923232748, 0.7317952826099017]
cat_score_mean: 0.7318428632974363
cat_score_std: 0.0018918585561348224
# Blend the two models that were actually trained with equal weights; the
# XGBoost run is commented out above, so xgb_test does not exist.
rh_test = lgb_test * 0.5 + cat_test * 0.5
testA['isDefault'] = rh_test
# to_csv has to be called to write anything. The output filename is an
# assumption - adjust it to whatever the submission format requires.
testA[['id', 'isDefault']].to_csv('submission.csv', index=False)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-49-81ca03ed5f9f> in <module>
----> 1 rh_test=lgb_test*0.5+xgb_test*0.5
2 testA['isDefault']=rh_test
3 testA[['id','isDefault']].to_csv
NameError: name 'xgb_test' is not defined