1 任务
使用网格搜索法对7个模型进行调优(调参时采用五折交叉验证的方式),并进行模型评估.
2 不同模型调参前后的性能
模型 | 默认参数下的roc_auc_score | 调整参数后的roc_auc_score | 调整的参数 |
---|---|---|---|
Logistic Regression | 0.766 | 0.767 | [{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter': [10, 50, 100, 150]}] |
决策树 | 0.594 | 0.656 | [{'max_depth': range(6, 10)}] |
SVC | 0.753 | 0.776 | [{'kernel': ['linear','poly','rbf','sigmoid']}] |
随机森林 | 0.720 | 0.749 | {'n_estimators': range(100,105)} |
GBDT | 0.764 | — | — |
xgboost | 0.771 | — | — |
LightGBM | 0.761 | — | — |
3 问题
- 网格搜索是依据验证集上的表现选出参数的, 因此这些参数在测试集上的表现不一定提高
- 应当对划分之后的训练集使用 StandardScaler(), 而不是在划分之前对全部数据做标准化, 这样分类器效果更好(也避免了测试集信息泄漏到训练过程).
4 完整代码及注释
# -*- coding: utf-8 -*-
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
# 引入要用到的评价函数
from sklearn.metrics import roc_curve, roc_auc_score ,make_scorer
# 引入用到的分类算法
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
def cal_roc_auc(model, x_test, y_test):
    """Return the ROC AUC of *model* on (x_test, y_test).

    Uses decision_function scores when the estimator provides them
    (e.g. SVC, LogisticRegression); otherwise falls back to the
    positive-class probability from predict_proba.

    Fix: the original also called model.predict(x_test) and discarded
    the result — a wasted full prediction pass, now removed.
    """
    if hasattr(model, 'decision_function'):
        y_score = model.decision_function(x_test)
    else:
        y_score = model.predict_proba(x_test)[:, 1]
    return roc_auc_score(y_test, y_score)
# Load the raw data set from disk.
data_all = pd.read_csv('data_all.csv')
# Split the target column off, cast features to float, and standardize
# each feature column (zero mean, unit variance).
df_y = data_all['status']
df_X = scale(data_all.drop(columns=['status']).astype("float"), axis=0, copy=True)
# Hold out 30% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=2018)
# Fit each of the seven classifiers on the training split using default
# hyper-parameters; fit() returns the estimator itself, so each name is
# bound to its fitted model.
lr = LogisticRegression(random_state=2018).fit(X_train, y_train)           # logistic regression
dt = DecisionTreeClassifier(random_state=2018).fit(X_train, y_train)       # decision tree
svc = SVC(random_state=2018).fit(X_train, y_train)                         # support-vector classifier
rfc = RandomForestClassifier(random_state=2018).fit(X_train, y_train)      # random forest
gbc = GradientBoostingClassifier(random_state=2018).fit(X_train, y_train)  # GBDT
xgbc = XGBClassifier(random_state=2018).fit(X_train, y_train)              # XGBoost
lgbc = LGBMClassifier(random_state=2018).fit(X_train, y_train)             # LightGBM
# Evaluate every trained model on the held-out test set.
# A parallel list of fitted estimators replaces the original
# eval()-by-name lookup, which is fragile and unsafe.
model_name = ["lr", "dt", "svc", "rfc", "gbc", "xgbc", "lgbc"]
models = [lr, dt, svc, rfc, gbc, xgbc, lgbc]
for name, model in zip(model_name, models):
    print(name)
    print(cal_roc_auc(model, X_test, y_test))
# Run a 5-fold-CV grid search for one model and print the results.
def gridsearch(model, parameters):
    """Tune *model* over *parameters* with GridSearchCV and print the outcome.

    Prints, in order: the best cross-validation score, the best parameter
    combination, the test-set ROC AUC of the refit best estimator, and the
    wall-clock time of the search.

    Fixes vs. the original:
    - scoring was make_scorer(roc_auc_score), which feeds *hard class
      labels* to roc_auc_score and therefore mis-scores candidates;
      'roc_auc' correctly uses probability / decision-function scores.
    - KFold was given random_state without shuffle=True; without
      shuffling random_state has no effect (and recent scikit-learn
      raises an error for that combination).

    NOTE(review): relies on the module-level X_train/y_train/X_test/y_test.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=2018)
    start_time = time.time()
    grid = GridSearchCV(model, parameters, scoring='roc_auc', cv=kfold)
    grid = grid.fit(X_train, y_train)
    end_time = time.time()
    # GridSearchCV refits the best candidate on the full training set.
    best = grid.best_estimator_
    print(grid.best_score_)
    print(grid.best_params_)
    print(cal_roc_auc(best, X_test, y_test))
    print("time: ", end_time - start_time)
# Names of the per-model grid variables, parallel to model_name.
parameters_name = ["parameters_" + x for x in model_name]
# One candidate grid per algorithm; each grid tries several values.
parameters_lr = [{
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [10, 50, 100, 150],
}]
parameters_dt = [{'max_depth': range(6, 10)}]
parameters_svc = [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}]
parameters_rfc = [{'n_estimators': range(100, 105)}]
parameters_gbc = {
    'n_estimators': range(50, 120, 10),
    'learning_rate': np.arange(0.1, 1, 0.1),
}
parameters_xgbc = {'max_depth': range(3, 5)}
parameters_lgbc = {'n_estimators': range(100, 110)}
# Pick one algorithm by index and tune it.  Explicit parallel lists
# replace the original eval()-based name lookup (eval on identifiers is
# fragile and unsafe).
models = [lr, dt, svc, rfc, gbc, xgbc, lgbc]
param_grids = [parameters_lr, parameters_dt, parameters_svc, parameters_rfc,
               parameters_gbc, parameters_xgbc, parameters_lgbc]
model_idx = 3  # index into model_name: 3 -> random forest
print(model_name[model_idx])
gridsearch(models[model_idx], param_grids[model_idx])