【一周算法实践】--4.模型调优

任务4:模型调优

使用网格搜索法对7个模型进行调优(调参时采用五折交叉验证的方式),并进行模型评估。

网格搜索(Grid Search)用简单的话来说,就是手动给出模型中你想要调整的所有参数及其候选值,程序自动使用穷举法把所有参数组合都运行一遍,从中选出效果最好的一组。例如在决策树中,我们常常将最大树深作为需要调节的参数。

K折交叉验证:
在这里插入图片描述

#1. 导入所需包
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

%matplotlib inline
# 2. Split the dataset and standardize the features
data_original = pd.read_csv('data_all.csv')
# These two calls only display something when they are the last expression
# of a notebook cell; they do not modify the frame.
data_original.head(5)
data_original.describe()
data = data_original.copy()
# Separate the label column y ('status') from the feature matrix X
y = data_original['status'].copy()
X = data_original.drop(['status'], axis=1).copy()
print("the X shape is:", X.shape)
print("the y shape is:", y.shape)  # previously mislabelled as "X shape"
print("the nums of label 1 in y are", len(y[y == 1]))
print("the nums of label 0 in y are", len(y[y == 0]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
print('the proportition of label 1 in y_test: %.2f%%'%(len(y_test[y_test==1])/len(y_test)*100))
# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so both splits share the same
# mean/scale and no test-set statistics leak into preprocessing.
# NOTE: metrics recorded with the earlier version, which re-fitted the scaler
# on X_test, may differ slightly from what this code produces.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

the X shape is: (4754, 84)
the X shape is: (4754,)
the nums of label 1 in y are 1193
the nums of label 0 in y are 3561
the proportition of label 1 in y_test: 25.16%
# 3. Build the grid-search wrappers. The author was not yet familiar with the
#    model parameters, so the grids were picked while reading the docs.
def _grid_search(estimator, param_grid):
    """Wrap *estimator* in a 5-fold GridSearchCV scored by ROC AUC."""
    return GridSearchCV(estimator, param_grid, cv=5, scoring='roc_auc')


# Candidate values searched for each model family.
parameters_lr = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
    'C': [0.1, 1, 10],
}
parameters_svm = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
}
parameters_dt = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
}
# Shared by all four tree ensembles: only the number of estimators is tuned.
parameters_en = {'n_estimators': range(10, 100, 10)}

lr_model = _grid_search(
    LogisticRegression(class_weight='balanced', max_iter=10000),
    parameters_lr,
)
svm_model = _grid_search(
    # probability=True is required because predict_proba is called later on.
    SVC(class_weight='balanced', gamma='auto', probability=True),
    parameters_svm,
)
dt_model = _grid_search(DecisionTreeClassifier(class_weight='balanced'), parameters_dt)
rf_model = _grid_search(RandomForestClassifier(class_weight='balanced'), parameters_en)
gbdt_model = _grid_search(GradientBoostingClassifier(), parameters_en)
xgb_model = _grid_search(XGBClassifier(), parameters_en)
lgbm_model = _grid_search(LGBMClassifier(), parameters_en)

# Display name -> search object; insertion order fixes the training order.
models = {
    'LR': lr_model,
    'SVM': svm_model,
    'DT': dt_model,
    'RF': rf_model,
    'GBDT': gbdt_model,
    'XGBoost': xgb_model,
    'LGBM': lgbm_model,
}
# 4. Define the evaluation helpers
# Empty results table; the training loop writes one row per (model, dataset).
_result_columns = (
    'model',
    'dataset',
    'accuracy',
    'precision',
    'recall',
    'f1_score',
    'auc',
)
df_result = pd.DataFrame(columns=_result_columns)
row = 0  # index of the next row to write into df_result
def evaluate(y_pre,y,y_proba):
    """Score a set of predictions against the true labels.

    Args:
        y_pre: Predicted class labels.
        y: True class labels.
        y_proba: 2-D array of class probabilities; column index 1 is used as
            the score for the ROC curve (presumably the positive class —
            confirm against the estimator's class order).

    Returns:
        A 7-tuple ``(accuracy, precision, recall, f1, fpr, tpr, auc)`` where
        ``fpr``/``tpr`` are the ROC curve points and ``auc`` is the area
        under that curve.
    """
    # Threshold-based metrics, in the order they appear in the return value.
    label_metrics = (accuracy_score, precision_score, recall_score, f1_score)
    label_scores = [metric(y, y_pre) for metric in label_metrics]

    # Ranking metric computed from the probability column; thresholds unused.
    fpr, tpr, _ = roc_curve(y, y_proba[:, 1])
    return (*label_scores, fpr, tpr, auc(fpr, tpr))

def plot_roc_curve(fpr,tpr,label=None):
    """Draw one ROC curve on the current matplotlib axes.

    No new figure is opened here, so successive calls overlay their curves
    on the same axes.

    Args:
        fpr: False-positive rates (x values).
        tpr: True-positive rates (y values).
        label: Optional legend entry for this curve.
    """
    plt.plot(fpr, tpr, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')  # fixed typo: was 'True Poisitive Rate'
    # Only build a legend when some artist on the axes carries a label;
    # calling plt.legend() with nothing labelled just emits a warning.
    handles, _ = plt.gca().get_legend_handles_labels()
    if handles:
        plt.legend()
# 5. Train every model and evaluate it on both splits
for name, model in models.items():
    print(name, 'start training...')
    model.fit(X_train, y_train)
    print(model.best_params_)

    # Test split first, so its row precedes the train row in df_result.
    y_pred_test, y_proba_test = model.predict(X_test), model.predict_proba(X_test)
    acc, p, r, f1, fpr_test, tpr_test, auc_test = evaluate(
        y_pred_test, y_test, y_proba_test
    )
    df_result.loc[row] = [name, 'test', acc, p, r, f1, auc_test]
    row += 1

    # Same metrics on the training split, for comparison with the test row.
    y_pred_train, y_proba_train = model.predict(X_train), model.predict_proba(X_train)
    acc, p, r, f1, fpr_train, tpr_train, auc_train = evaluate(
        y_pred_train, y_train, y_proba_train
    )
    df_result.loc[row] = [name, 'train', acc, p, r, f1, auc_train]
    row += 1

    # Only the test-split ROC curve is drawn; all models share one figure.
    plot_roc_curve(fpr_test, tpr_test, label=name)

print(df_result)
plt.show()
LR start training...
{'C': 1, 'solver': 'newton-cg'}
SVM start training...
{'C': 1, 'kernel': 'linear'}
DT start training...
{'criterion': 'entropy', 'max_features': 'sqrt'}
RF start training...
{'n_estimators': 80}
GBDT start training...
{'n_estimators': 50}
XGBoost start training...
{'n_estimators': 50}
LGBM start training...



{'n_estimators': 30}
      model dataset  accuracy  precision    recall  f1_score       auc
0        LR    test  0.701472   0.437616  0.654596  0.524554  0.751703
1        LR   train  0.764653   0.522467  0.711031  0.602336  0.823539
2       SVM    test  0.689559   0.423913  0.651811  0.513721  0.743370
3       SVM   train  0.763150   0.519896  0.720624  0.604020  0.821762
4        DT    test  0.700771   0.395706  0.359331  0.376642  0.587437
5        DT   train  1.000000   1.000000  1.000000  1.000000  1.000000
6        RF    test  0.766643   0.654762  0.153203  0.248307  0.762133
7        RF   train  1.000000   1.000000  1.000000  1.000000  1.000000
8      GBDT    test  0.780659   0.638554  0.295265  0.403810  0.763197
9      GBDT   train  0.835287   0.837264  0.425659  0.564388  0.880471
10  XGBoost    test  0.789068   0.662921  0.328691  0.439479  0.768588
11  XGBoost   train  0.831981   0.832930  0.412470  0.551724  0.881231
12     LGBM    test  0.780659   0.622340  0.325905  0.427788  0.758161
13     LGBM   train  0.908626   0.964912  0.659472  0.783476  0.968270

在这里插入图片描述

将每个模型的最优参数打印出来,可以发现:每个模型的最优参数各不相同,这就要求我们在训练模型时耐心地寻找合适的参数。即使是相同的数据集,在不同的模型下也会有不同的最优参数。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值