模型调参方法

最新推荐文章于 2024-05-09 15:35:47 发布

OrangeCat橘猫

最新推荐文章于 2024-05-09 15:35:47 发布

阅读量1.1k

点赞数

分类专栏：机器学习集成学习模型优化

原文链接：https://github.com/datawhalechina/team-learning-data-mining/tree/master/EnsembleLearning

版权

机器学习同时被 3 个专栏收录

11 篇文章 0 订阅

订阅专栏

集成学习

6 篇文章 0 订阅

订阅专栏

模型优化

5 篇文章 0 订阅

订阅专栏

模型参数调参

网格搜索GridSearchCV()
随机网格搜索RandomizedSearchCV()
类别为二分类时
实例：XGBoost调参

网格搜索GridSearchCV()

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import time

start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(),SVC(random_state=1))
param_range = [0.0001,0.001,0.01,0.1,1.0,10.0,100.0,1000.0]
param_grid = [{'svc__C':param_range,'svc__kernel':['linear']},{'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,param_grid=param_grid,scoring='accuracy',cv=10,n_jobs=-1)
gs = gs.fit(X,y)
end_time = time.time()
print("网格搜索经历时间：%.3f S" % float(end_time-start_time))
print(gs.best_score_)
print(gs.best_params_)

网格搜索经历时间：4.300 S
0.9800000000000001
{‘svc__C’: 1.0, ‘svc__gamma’: 0.1, ‘svc__kernel’: ‘rbf’}

随机网格搜索RandomizedSearchCV()

from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import time

start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(),SVC(random_state=1))
param_range = [0.0001,0.001,0.01,0.1,1.0,10.0,100.0,1000.0]
param_grid = [{'svc__C':param_range,'svc__kernel':['linear']},{'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']}]
# param_grid = [{'svc__C':param_range,'svc__kernel':['linear','rbf'],'svc__gamma':param_range}]
gs = RandomizedSearchCV(estimator=pipe_svc, param_distributions=param_grid,scoring='accuracy',cv=10,n_jobs=-1)
gs = gs.fit(X,y)
end_time = time.time()
print("随机网格搜索经历时间：%.3f S" % float(end_time-start_time))
print(gs.best_score_)
print(gs.best_params_)

随机网格搜索经历时间：0.942 S
0.9733333333333334
{‘svc__kernel’: ‘linear’, ‘svc__C’: 100.0}

类别为二分类时

# 混淆矩阵：
# 加载数据
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",header=None)
'''
乳腺癌数据集：569个恶性和良性肿瘤细胞的样本，M为恶性，B为良性
'''
# 做基本的数据预处理
from sklearn.preprocessing import LabelEncoder

X = df.iloc[:,2:].values
y = df.iloc[:,1].values
le = LabelEncoder()    #将M-B等字符串编码成计算机能识别的0-1
y = le.fit_transform(y)
le.transform(['M','B'])
# 数据切分8：2
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=1)
from sklearn.svm import SVC
pipe_svc = make_pipeline(StandardScaler(),SVC(random_state=1))
from sklearn.metrics import confusion_matrix

pipe_svc.fit(X_train,y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test,y_pred=y_pred)
fig,ax = plt.subplots(figsize=(2.5,2.5))
ax.matshow(confmat, cmap=plt.cm.Blues,alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j,y=i,s=confmat[i,j],va='center',ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

在这里插入图片描述
绘制ROC曲线

# 绘制ROC曲线：
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import make_scorer,f1_score
scorer = make_scorer(f1_score,pos_label=0)
gs = GridSearchCV(estimator=pipe_svc,param_grid=param_grid,scoring=scorer,cv=10)
y_pred = gs.fit(X_train,y_train).decision_function(X_test)
#y_pred = gs.predict(X_test)
fpr,tpr,threshold = roc_curve(y_test, y_pred) ###计算真阳率和假阳率
roc_auc = auc(fpr,tpr) ###计算auc的值
plt.figure()
lw = 2
plt.figure(figsize=(7,5))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假阳率为横坐标，真阳率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic ')
plt.legend(loc="lower right")
plt.show()

在这里插入图片描述

实例：XGBoost调参

import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

train_x, valid_x, train_y, valid_y = train_test_split(x_train, y_train, test_size=0.333, random_state=0)   # 分训练集和验证集
# 这里不需要Dmatrix

parameters = {
              'max_depth': [5, 10, 15, 20, 25],# 树的最大深度
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],# 学习率
              'n_estimators': [500, 1000, 2000, 3000, 5000],# 最大迭代次数
              'min_child_weight': [0, 2, 5, 10, 20],# 新分裂的节点样本权重停止分裂的最小阈值
              'max_delta_step': [0, 0.2, 0.6, 1, 2],# 叶子输出的最大步长
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],# 样本采样率
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],# 列采样率
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],# L2正则化
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],# L1正则化
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]# 样本不均衡时

}

xlf = xgb.XGBClassifier(max_depth=10,
            learning_rate=0.01,
            n_estimators=2000,
            silent=True,
            objective='binary:logistic',
            nthread=-1,
            gamma=0,
            min_child_weight=1,
            max_delta_step=0,
            subsample=0.85,
            colsample_bytree=0.7,
            colsample_bylevel=1,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            seed=1440,
            missing=None)
            
# 有了gridsearch我们便不需要fit函数
gsearch = GridSearchCV(xlf, param_grid=parameters, scoring='accuracy', cv=3)
gsearch.fit(train_x, train_y)

print("Best score: %0.3f" % gsearch.best_score_)
print("Best parameters set:")
best_parameters = gsearch.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

OrangeCat橘猫

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
模型调参方法

模型参数调参网格搜索GridSearchCV()随机网格搜索RandomizedSearchCV()类别为二分类时实例：XGBoost调参网格搜索GridSearchCV()from sklearn.model_selection import GridSearchCVfrom sklearn.svm import SVCimport timestart_time = time.time()pipe_svc = make_pipeline(StandardScaler(),SVC(rando
复制链接

扫一扫