Task 4: Model Tuning
Tune the seven models with grid search (using five-fold cross-validation during tuning), then evaluate each model.
Grid search, put simply, means you manually list all the parameter values you want to try for a model, and the program exhaustively runs every combination for you. In a decision tree, for example, the maximum tree depth is a parameter we often tune; a sketch of that idea is shown below.
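A minimal grid-search sketch on illustrative toy data rather than this task's dataset (the candidate max_depth values are assumptions chosen just for the example):
# Grid-search sketch: exhaustively try several max_depth values with 5-fold CV
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)  # toy data
param_grid = {'max_depth': [3, 5, 7, None]}                 # values to try exhaustively
search = GridSearchCV(DecisionTreeClassifier(random_state=0),
                      param_grid, cv=5, scoring='roc_auc')  # 5-fold CV, AUC scoring
search.fit(X_demo, y_demo)
print(search.best_params_, search.best_score_)              # best depth and its mean CV AUC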
K-fold cross-validation: the training data is split into K folds; each fold in turn serves as the validation set while the remaining K-1 folds fit the model, and the K scores are averaged. In this task K=5 (cv=5 in GridSearchCV); a minimal sketch follows below.
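A minimal sketch of five-fold cross-validation on its own (again on toy data; the LogisticRegression estimator here is just an illustrative choice):
# 5-fold cross-validation sketch: one AUC score per fold, then the average
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)
scores = cross_val_score(LogisticRegression(max_iter=1000), X_demo, y_demo,
                         cv=5, scoring='roc_auc')           # five per-fold AUCs
print(scores, scores.mean())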
# 1. Import the required packages
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline
# 2. Split the dataset and standardize the features
data_original=pd.read_csv('data_all.csv')
data_original.head(5)
data_original.describe()
data=data_original.copy()
# Split the data into label y and features X
y=data_original['status'].copy()
X=data_original.drop(['status'],axis=1).copy()
print("the X shape is:", X.shape)
print("the X shape is:" ,y.shape)
print("the nums of label 1 in y are",len(y[y==1]))
print("the nums of label 0 in y are",len(y[y==0]))
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2018)
print('the proportion of label 1 in y_test: %.2f%%'%(len(y_test[y_test==1])/len(y_test)*100))
# Standardize the data: fit the scaler on the training set, then apply it to the test set
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)
the X shape is: (4754, 84)
the y shape is: (4754,)
the nums of label 1 in y are 1193
the nums of label 0 in y are 3561
the proportion of label 1 in y_test: 25.16%
# 3. Build the models and set up grid search; not yet fluent with every model's parameters, I picked the grids while consulting the documentation
parameters_lr={'solver':['newton-cg','lbfgs','liblinear','sag'],'C':[0.1,1,10]}
lr_model=GridSearchCV(LogisticRegression(class_weight='balanced',max_iter=10000),parameters_lr,cv=5,scoring='roc_auc')
parameters_svm={'kernel':['linear','rbf','poly'],'C':[0.1,1,10]}
svm_model=GridSearchCV(SVC(class_weight='balanced',gamma='auto',probability=True),parameters_svm,cv=5,scoring='roc_auc')
parameters_dt={'criterion':['gini','entropy'],'max_features':['sqrt','log2',None]}
dt_model=GridSearchCV(DecisionTreeClassifier(class_weight='balanced'),parameters_dt,cv=5,scoring='roc_auc')
parameters_en={'n_estimators':range(10,100,10)}
rf_model=GridSearchCV(RandomForestClassifier(class_weight='balanced'),parameters_en,cv=5,scoring='roc_auc')
gbdt_model=GridSearchCV(GradientBoostingClassifier(),parameters_en,cv=5,scoring='roc_auc')
xgb_model=GridSearchCV(XGBClassifier(),parameters_en,cv=5,scoring='roc_auc')
lgbm_model=GridSearchCV(LGBMClassifier(),parameters_en,cv=5,scoring='roc_auc')
models={'LR':lr_model,
'SVM':svm_model,
'DT':dt_model,
'RF':rf_model,
'GBDT':gbdt_model,
'XGBoost':xgb_model,
'LGBM':lgbm_model}
# 4. Define the model evaluation function
df_result=pd.DataFrame(columns=('model','dataset','accuracy','precision','recall','f1_score','auc'))
row=0
def evaluate(y_pre,y,y_proba):
    acc=accuracy_score(y,y_pre)
    p=precision_score(y,y_pre)
    r=recall_score(y,y_pre)
    f1=f1_score(y,y_pre)
    fpr,tpr,thresholds=roc_curve(y,y_proba[:,1])  # column 1 holds the positive-class probability
    model_auc=auc(fpr,tpr)
    return acc,p,r,f1,fpr,tpr,model_auc
def plot_roc_curve(fpr,tpr,label=None):
    #plt.figure(figsize=(8,6))
    plt.plot(fpr,tpr,label=label)
    plt.plot([0,1],[0,1],'k--')
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
# 5. Train the models and evaluate them
#plt.figure(figsize=(8,6))
for name,model in models.items():
    print(name,'start training...')
    model.fit(X_train,y_train)
    print(model.best_params_)
    y_pred_test=model.predict(X_test)
    y_proba_test=model.predict_proba(X_test)
    acc,p,r,f1,fpr_test,tpr_test,auc_test=evaluate(y_pred_test,y_test,y_proba_test)
    df_result.loc[row]=[name,'test',acc,p,r,f1,auc_test]
    row+=1
    y_pred_train=model.predict(X_train)
    y_proba_train=model.predict_proba(X_train)
    acc,p,r,f1,fpr_train,tpr_train,auc_train=evaluate(y_pred_train,y_train,y_proba_train)
    df_result.loc[row]=[name,'train',acc,p,r,f1,auc_train]
    row+=1
    plot_roc_curve(fpr_test,tpr_test,label=name)
    #plot_roc_curve(fpr_train,tpr_train,label=name)
print(df_result)
plt.show()
LR start training...
{'C': 1, 'solver': 'newton-cg'}
SVM start training...
{'C': 1, 'kernel': 'linear'}
DT start training...
{'criterion': 'entropy', 'max_features': 'sqrt'}
RF start training...
{'n_estimators': 80}
GBDT start training...
{'n_estimators': 50}
XGBoost start training...
{'n_estimators': 50}
LGBM start training...
{'n_estimators': 30}
model dataset accuracy precision recall f1_score auc
0 LR test 0.701472 0.437616 0.654596 0.524554 0.751703
1 LR train 0.764653 0.522467 0.711031 0.602336 0.823539
2 SVM test 0.689559 0.423913 0.651811 0.513721 0.743370
3 SVM train 0.763150 0.519896 0.720624 0.604020 0.821762
4 DT test 0.700771 0.395706 0.359331 0.376642 0.587437
5 DT train 1.000000 1.000000 1.000000 1.000000 1.000000
6 RF test 0.766643 0.654762 0.153203 0.248307 0.762133
7 RF train 1.000000 1.000000 1.000000 1.000000 1.000000
8 GBDT test 0.780659 0.638554 0.295265 0.403810 0.763197
9 GBDT train 0.835287 0.837264 0.425659 0.564388 0.880471
10 XGBoost test 0.789068 0.662921 0.328691 0.439479 0.768588
11 XGBoost train 0.831981 0.832930 0.412470 0.551724 0.881231
12 LGBM test 0.780659 0.622340 0.325905 0.427788 0.758161
13 LGBM train 0.908626 0.964912 0.659472 0.783476 0.968270
Printing each model's best parameters shows that the optimal settings differ from model to model, so parameters have to be searched carefully for every model we train; even on the same dataset, different models end up with different optimal parameters.
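As a small follow-up sketch (assuming the fitted GridSearchCV objects in the models dict above are still in memory), the cross-validated score behind each best parameter choice can be read off directly:
# Best parameters and the mean 5-fold CV AUC that selected them
for name, model in models.items():
    print(name, model.best_params_, 'CV AUC: %.4f' % model.best_score_)
    # model.cv_results_['mean_test_score'] lists the mean CV score of every
    # parameter combination tried, if a fuller comparison is wanted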