1. Task Description
Tune seven models with grid search (using 5-fold cross-validation during tuning), evaluate each model, and show the output of each run.
2. Implementation and Results Comparison
- Package imports and data preprocessing
from pandas import Series, DataFrame
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
# Read in the data
data = pd.read_csv(r'data_all.csv')
# Split into training and test sets
X = data.drop(['status'], axis=1)
y = data['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
# Standardize features: fit the scaler on the training set only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
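A side note: because the scaler is fit on all of X_train before GridSearchCV later splits it into folds, each validation fold "sees" a little of its own statistics during tuning. A minimal leak-free sketch using sklearn's Pipeline follows; LogisticRegression stands in as an arbitrary example model, and X_raw_train is a hypothetical name for the unscaled training split:

from sklearn.pipeline import Pipeline

# The scaler is refit on each training fold inside GridSearchCV, so no
# statistics from a validation fold leak into the preprocessing step.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('clf', LogisticRegression(random_state=2018))])
# Parameters of pipeline steps are addressed as '<step>__<parameter>'.
para = {'clf__C': [1e-2, 1e-1, 1, 10]}
grid = GridSearchCV(pipe, para, cv=5, scoring='accuracy', n_jobs=4)
# grid.fit(X_raw_train, y_train)  # would take the *unscaled* training split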
- Grid search function
# Grid search with 5-fold cross-validation
def grid_search(model, para):
    # n_jobs=4 parallelizes the search; for XGBoost this had to be 1 (see Questions)
    grid = GridSearchCV(model, para, cv=5, scoring='accuracy', n_jobs=4)
    grid = grid.fit(X_train, y_train)
    # SVC without probability=True has no predict_proba, so fall back to
    # decision_function; either works as a score input for roc_auc_score
    if hasattr(model, 'decision_function'):
        y_predict_pro = grid.decision_function(X_test)
        y_default_predict_pro = model.decision_function(X_test)
    else:
        y_predict_pro = grid.predict_proba(X_test)[:, 1]
        y_default_predict_pro = model.predict_proba(X_test)[:, 1]
    print('Before vs. after tuning:')
    print('best score:', grid.best_score_)
    print('default parameters:', model)
    print('best parameters:', grid.best_params_)
    # score() returns accuracy, not error
    print('default parameters, training accuracy:', model.score(X_train, y_train))
    print('best parameters, training accuracy:', grid.score(X_train, y_train))
    print('default parameters, test accuracy:', model.score(X_test, y_test))
    print('best parameters, test accuracy:', grid.score(X_test, y_test))
    print('default parameters, AUC:', roc_auc_score(y_test, y_default_predict_pro))
    print('best parameters, AUC:', roc_auc_score(y_test, y_predict_pro))
- SVM
Implementation:
# SVM
clf_svm = svm.SVC(random_state=2018)
clf_svm.fit(X_train, y_train)
# Tuned parameters: C, kernel
para = {'C': [1e-2, 1e-1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid_search(clf_svm, para)
Comparison output:
Before vs. after tuning:
best score: 0.795311091073
default parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=2018, shrinking=True,
    tol=0.001, verbose=False)
best parameters: {'kernel': 'linear', 'C': 0.1}
default parameters, training accuracy: 0.842801322513
best parameters, training accuracy: 0.8028253682
default parameters, test accuracy: 0.780658724597
best parameters, test accuracy: 0.777855641205
default parameters, AUC: 0.753124576174
best parameters, AUC: 0.777041407155
AUC: 0.7531 -> 0.7770
- Decision Tree
Implementation:
# Decision tree
clf_tree = tree.DecisionTreeClassifier(random_state=2018)
clf_tree.fit(X_train, y_train)
# Tuned parameters: criterion, splitter, max_depth, max_features
para = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],
        'max_depth': [2, 4, 6, 8], 'max_features': ['sqrt', 'log2', None]}
grid_search(clf_tree, para)
Output:
Before vs. after tuning:
best score: 0.770363691013
default parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
    max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
    min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, presort=False, random_state=2018,
    splitter='best')
best parameters: {'max_features': None, 'splitter': 'random', 'criterion': 'entropy', 'max_depth': 6}
default parameters, training accuracy: 1.0
best parameters, training accuracy: 0.804328223625
default parameters, test accuracy: 0.685353889278
best parameters, test accuracy: 0.763139453399
default parameters, AUC: 0.595629505597
best parameters, AUC: 0.710688502186
AUC: 0.5956 -> 0.7106
- Logistic Regression (LR)
Implementation:
# LR
clf_lr = LogisticRegression(random_state=2018)
clf_lr.fit(X_train, y_train)
# Tuned parameters: penalty, C
para = {'penalty': ['l1', 'l2'], 'C': [1e-2, 1e-1, 1, 10]}
grid_search(clf_lr, para)
Output:
Before vs. after tuning:
best score: 0.792906522393
default parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
    intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
    penalty='l2', random_state=2018, solver='liblinear', tol=0.0001,
    verbose=0, warm_start=False)
best parameters: {'penalty': 'l1', 'C': 0.1}
default parameters, training accuracy: 0.804929365795
best parameters, training accuracy: 0.801923654944
default parameters, test accuracy: 0.787666433076
best parameters, test accuracy: 0.789067974772
default parameters, AUC: 0.765742856249
best parameters, AUC: 0.779722596059
AUC: 0.7657 -> 0.7797
- Random Forest
Implementation:
# Random forest
clf_rf = RandomForestClassifier(random_state=2018)
clf_rf.fit(X_train, y_train)
# Tuned parameters: n_estimators, criterion, max_depth, max_features
para = {'n_estimators': [10, 20, 50, 100], 'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, None], 'max_features': ['sqrt', 'log2', None]}
grid_search(clf_rf, para)
Output:
Before vs. after tuning:
best score: 0.798316801924
default parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
    max_depth=None, max_features='auto', max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10,
    n_jobs=1, oob_score=False, random_state=2018, verbose=0, warm_start=False)
best parameters: {'max_features': 'sqrt', 'n_estimators': 100, 'criterion': 'gini', 'max_depth': None}
default parameters, training accuracy: 0.984670874662
best parameters, training accuracy: 1.0
default parameters, test accuracy: 0.768745620182
best parameters, test accuracy: 0.779957953749
default parameters, AUC: 0.719929214526
best parameters, AUC: 0.748973688878
AUC: 0.7199 -> 0.7489
- GBDT
Implementation:
# GBDT
clf_gbdt = GradientBoostingClassifier(random_state=2018)
clf_gbdt.fit(X_train, y_train)
# Tuned parameters: max_features, learning_rate, n_estimators, subsample
para = {'max_features': ['sqrt', 'log2', None], 'learning_rate': [0.01, 0.1, 0.5, 1],
        'n_estimators': range(20, 200, 20), 'subsample': [0.2, 0.5, 0.7, 1.0]}
grid_search(clf_gbdt, para)
Output:
Before vs. after tuning:
best score: 0.798016230839
default parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1,
    loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100,
    presort='auto', random_state=2018, subsample=1.0, verbose=0,
    warm_start=False)
best parameters: {'max_features': 'sqrt', 'n_estimators': 80, 'learning_rate': 0.1, 'subsample': 0.7}
default parameters, training accuracy: 0.862338443042
best parameters, training accuracy: 0.835287045386
default parameters, test accuracy: 0.780658724597
best parameters, test accuracy: 0.787666433076
default parameters, AUC: 0.76334856499
best parameters, AUC: 0.775106152129
AUC: 0.7633 -> 0.7751
- XGBoost
Implementation:
# XGBoost
clf_xgb = xgb.XGBClassifier()
clf_xgb.fit(X_train, y_train)
# Tuned parameters: n_estimators, max_depth, reg_lambda
para = {'n_estimators': range(20, 200, 20), 'max_depth': [2, 6, 10], 'reg_lambda': [0.2, 0.5, 1]}
grid_search(clf_xgb, para)
Output:
Before vs. after tuning:
best score: 0.794709948903
default parameters: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
    learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1,
    missing=None, n_estimators=100, nthread=-1, objective='binary:logistic',
    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
    subsample=1)
best parameters: {'n_estimators': 60, 'reg_lambda': 1, 'max_depth': 2}
default parameters, training accuracy: 0.851217312894
best parameters, training accuracy: 0.815148782687
default parameters, test accuracy: 0.791871058164
best parameters, test accuracy: 0.78976874562
default parameters, AUC: 0.770729658957
best parameters, AUC: 0.777166598854
AUC: 0.7707 -> 0.7772
- LightGBM
Implementation:
# LightGBM
clf_lgb = lgb.LGBMClassifier(random_state=2018)
clf_lgb.fit(X_train, y_train)
# Tuned parameters: learning_rate, max_depth, n_estimators
para = {'learning_rate': [0.2, 0.5, 0.7], 'max_depth': range(1, 10, 2),
        'n_estimators': range(20, 100, 10)}
grid_search(clf_lgb, para)
Output:
Before vs. after tuning:
best score: 0.797715659754
default parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
    importance_type='split', learning_rate=0.1, max_depth=-1,
    min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
    n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
    random_state=2018, reg_alpha=0.0, reg_lambda=0.0, silent=True,
    subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
best parameters: {'n_estimators': 90, 'learning_rate': 0.2, 'max_depth': 1}
default parameters, training accuracy: 0.995792004809
best parameters, training accuracy: 0.810940787496
default parameters, test accuracy: 0.770147161878
best parameters, test accuracy: 0.787666433076
default parameters, AUC: 0.753521016557
best parameters, AUC: 0.778567180996
AUC: 0.7535 -> 0.7786
3. Questions
1) For XGBoost, n_jobs in the grid_search() function had to be set to 1 for the search to run properly. Why?
2) Tuning improved every model's AUC to some degree, but the gains are modest. How can the parameters be tuned further? (A sketch addressing both questions follows below.)
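A minimal sketch addressing both questions, assuming the same X_train/X_test split as above. On (1): XGBoost runs its own thread pool (nthread=-1 by default in this version), and stacking it on top of GridSearchCV's process-based parallelism can oversubscribe or hang the CPU, so one of the two levels is usually pinned to a single thread. On (2): a common next step is a randomized search over finer, partly continuous ranges around the grid optimum. The ranges below are illustrative assumptions, not values from the runs above:

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# (1) Pin XGBoost itself to one thread and let scikit-learn parallelize
# across folds instead, so the two thread pools do not fight each other.
clf_xgb = xgb.XGBClassifier(nthread=1)

# (2) Refine around the grid optimum found above (n_estimators=60,
# max_depth=2, reg_lambda=1); the bounds here are assumptions.
para_dist = {
    'n_estimators': randint(40, 90),       # integers in [40, 90)
    'max_depth': randint(2, 5),            # integers in [2, 5)
    'learning_rate': uniform(0.05, 0.15),  # floats in [0.05, 0.20)
    'reg_lambda': uniform(0.5, 1.0),       # floats in [0.5, 1.5)
    'subsample': uniform(0.6, 0.4),        # floats in [0.6, 1.0)
}
rand_search = RandomizedSearchCV(clf_xgb, para_dist, n_iter=50, cv=5,
                                 scoring='roc_auc', n_jobs=4,
                                 random_state=2018)
rand_search.fit(X_train, y_train)
print('best parameters:', rand_search.best_params_)
print('test AUC:', roc_auc_score(y_test, rand_search.predict_proba(X_test)[:, 1]))

Note that scoring='roc_auc' targets the metric actually being reported, rather than the accuracy used in the original grid; a two-stage approach (coarse grid first, then randomized refinement) tends to give larger AUC gains than simply widening the original grid.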