使用网格搜索法对7个模型进行调优(调参时采用五折交叉验证的方式),并进行模型评估
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import warnings
# Silence sklearn deprecation noise so the grid-search output stays readable.
warnings.filterwarnings(action ='ignore', category = DeprecationWarning)
# Load the pre-processed feature table; assumes 'data_all.csv' sits next to
# this script and contains a binary 'status' label column — TODO confirm.
df = pd.read_csv('data_all.csv')
# Notebook leftover: result is discarded when run as a script.
df.head()
# Target / features split.
y = df['status']
x = df.drop('status', axis=1)
# 70/30 hold-out split; fixed random_state keeps the comparison reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
# Fit all seven models with default hyper-parameters first; gride_search()
# later compares each default model's test AUC against its tuned version.
lr = LogisticRegression()
lr.fit(x_train,y_train)
# probability=True so predict_proba is available (needed for AUC).
svm = SVC(probability=True)
svm.fit(x_train,y_train)
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
gbdt=GradientBoostingClassifier()
gbdt.fit(x_train,y_train)
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
gbm=LGBMClassifier()
gbm.fit(x_train, y_train)
xgbc=XGBClassifier()
xgbc.fit(x_train, y_train)
def gride_search(model, para, cv=5, scoring='accuracy'):
    """Grid-search *model* over the parameter grid *para* and report test AUC.

    Fits a ``GridSearchCV`` (default: 5-fold CV, accuracy scoring) on the
    module-level ``x_train``/``y_train``, then prints the best CV score, the
    best parameters, and the hold-out ROC-AUC of both the already-fitted
    default *model* and the tuned estimator.

    Parameters
    ----------
    model : fitted sklearn-style classifier used as the baseline.
    para : dict mapping parameter names to candidate value lists.
    cv : int, number of cross-validation folds (default 5).
    scoring : str, sklearn scoring name used to pick the best params
        (default 'accuracy').

    Returns
    -------
    GridSearchCV : the fitted grid, so callers can reuse ``best_estimator_``.
        (Previously the fitted grid was discarded; returning it is
        backward-compatible since callers ignored the return value.)
    """
    grid = GridSearchCV(model, para, cv=cv, scoring=scoring)
    grid = grid.fit(x_train, y_train)
    # Use raw decision scores for AUC when the estimator exposes them
    # (e.g. SVC, LogisticRegression); otherwise fall back to the
    # positive-class probability. Both are valid inputs to roc_auc_score.
    if hasattr(model, 'decision_function'):
        y_predict_pro = grid.decision_function(x_test)
        y_default_predict_pro = model.decision_function(x_test)
    else:
        y_predict_pro = grid.predict_proba(x_test)[:, 1]
        y_default_predict_pro = model.predict_proba(x_test)[:, 1]
    print('参数调整前后对比:')
    print('best score:', grid.best_score_)
    print('最优参数:', grid.best_params_)
    print('默认参数 AUC:', roc_auc_score(y_test, y_default_predict_pro))
    print('最优参数 AUC:', roc_auc_score(y_test, y_predict_pro))
    return grid
# One (label, baseline model, hyper-parameter grid) entry per experiment;
# running them through a single loop keeps the seven comparisons uniform.
experiments = [
    ('LogisticRegression: ', lr,
     {'penalty': ['l1', 'l2'], 'C': [1e-3, 1e-2, 1e-1, 1, 10]}),
    ('svm: ', svm,
     {'C': [1e-3, 1e-2, 1e-1, 1], 'kernel': ['linear', 'sigmoid']}),
    ('DecisionTreeClassifier: ', dt,
     {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],
      'max_depth': range(3, 10, 3), 'max_features': ['sqrt', 'log2', None]}),
    ('GradientBoostingClassifier: ', gbdt,
     {'max_features': ['sqrt', 'log2', None], 'learning_rate': [0.01, 0.1, 1],
      'n_estimators': range(50, 200, 50)}),
    ('RandomForestClassifier: ', rfc,
     {'n_estimators': [20, 50, 100], 'criterion': ['gini', 'entropy'],
      'max_depth': range(3, 10, 3), 'max_features': ['sqrt', 'log2', None]}),
    ('lgb: ', gbm,
     {'learning_rate': [0.2, 0.5, 0.7], 'max_depth': range(1, 10, 3),
      'n_estimators': range(20, 100, 20)}),
    ('xgb: ', xgbc,
     {'n_estimators': range(50, 200, 50), 'max_depth': [3, 6, 10],
      'reg_lambda': [0.2, 0.5, 1]}),
]
# Same order and same printed labels as the original one-by-one calls.
for label, model, para in experiments:
    print(label)
    gride_search(model, para)
LogisticRegression:
参数调整前后对比:
best score: 0.7938082356477307
最优参数: {'C': 0.1, 'penalty': 'l1'}
默认参数 AUC: 0.5674548527432631
最优参数 AUC: 0.7706461978237509
DecisionTreeClassifier:
参数调整前后对比:
best score: 0.7730688307784791
最优参数: {'criterion': 'gini', 'max_depth': 6, 'max_features': None, 'splitter': 'random'}
默认参数 AUC: 0.5751828320449022
最优参数 AUC: 0.7053561182226951
GradientBoostingClassifier:
参数调整前后对比:
best score: 0.7965133754132853
最优参数: {'learning_rate': 0.1, 'max_features': None, 'n_estimators': 50}
默认参数 AUC: 0.7623965864396524
最优参数 AUC: 0.7674720666019844
RandomForestClassifier:
参数调整前后对比:
best score: 0.7932070934776074
最优参数: {'criterion': 'gini', 'max_depth': 6, 'max_features': None, 'n_estimators': 50}
默认参数 AUC: 0.7037573158899566
最优参数 AUC: 0.7594298039706633
lgb:
参数调整前后对比:
best score: 0.7962128043282236
最优参数: {'learning_rate': 0.2, 'max_depth': 1, 'n_estimators': 60}
默认参数 AUC: 0.7574019592501017
最优参数 AUC: 0.7777234411025216
xgb:
参数调整前后对比:
best score: 0.7968139464983469
最优参数: {'max_depth': 3, 'n_estimators': 50, 'reg_lambda': 0.5}
默认参数 AUC: 0.7713634419371329
最优参数 AUC: 0.7705392632468467
SVM没跑出来!等跑出来我再加上!