机器学习随机森林乳腺癌调参

泛化误差:当模型在测试集上表现糟糕时,说明泛化程度不够、泛化误差大;泛化误差与模型复杂度密切相关。
当模型太复杂,模型就会过拟合,泛化能力就不够,所以泛化误差大。当模型太简单,
模型就会欠拟合,拟合能力就不够,所以误差也会大。只有当模型的复杂度刚刚好时,才能够达到泛化误差小的目标。

1:导库

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

1.1 引入时间time模块计算各个测试花费时间

import time

# Timing template reused throughout this notebook: take a timestamp before and
# after a step; dtime is the elapsed wall-clock time in seconds (here ~0, since
# nothing runs between the two calls — this cell just demonstrates the pattern).
starttime = time.time()
endtime = time.time()
dtime = endtime - starttime

1.2 加载数据集

# Load the scikit-learn breast-cancer dataset: 569 samples, 30 features,
# binary target (the echoed shapes below confirm this).
data=load_breast_cancer()
data.data.shape
(569, 30)
data.target.shape
(569,)

1.3 简单建模,查看未进行调参时候的准确率,方便对比

# Baseline: 10-fold cross-validated accuracy with an untuned forest
# (n_estimators=100, fixed random_state), kept for comparison with the
# tuned models below.
rfc=RandomForestClassifier(n_estimators=100,random_state=12)
score_pre=cross_val_score(rfc,data.data,data.target,cv=10).mean()
score_pre
0.9578634085213033

2 随机森林调整

2.1 第一步:无论如何先来调 n_estimators。它的影响权重最大,使用学习曲线,找到 n_estimators 取值平稳的范围

# Coarse learning curve over n_estimators = 1, 11, 21, ..., 191
# (loop index i steps by 10; the model gets i+1 trees), 10-fold CV each.
# elapsed time: 42.656270265579224 s
a=time.time()
# s_time=time.time()
scorel=[]
for i in range(0,200,10):
    rfc=RandomForestClassifier(n_estimators=i+1
                              ,random_state=12
                              ,n_jobs=-1)
    score_pre=cross_val_score(rfc,data.data,data.target,cv=10).mean()
    scorel.append(score_pre)
b=time.time()
sum_time=b-a
print(sum_time)
42.656270265579224
# Best coarse score and the n_estimators value that produced it (index*10 + 1).
print(max(scorel),(scorel.index(max(scorel))*10)+1)
0.9648809523809524 81
plt.plot(range(0,200,10),scorel)

(图:n_estimators 粗粒度学习曲线)

2.2 在确定好的范围内进一步细化学习曲线,找到最好的点

  • 确定为80,左右各取5,在这个范围内取值
# Refine the curve around 80: loop index i runs 75..84, the fitted forest
# gets n_estimators = i+1, i.e. 76..85 trees.
a=time.time()
# s_time=time.time()
scorel=[]
for i in range(75,85):
    rfc=RandomForestClassifier(n_estimators=i+1
                              ,random_state=12
                              ,n_jobs=-1)
    score_pre=cross_val_score(rfc,data.data,data.target,cv=10).mean()
    scorel.append(score_pre)
b=time.time()
sum_time=b-a
print(sum_time)
20.36061978340149
# NOTE(review): the x-axis is the loop index (75..84) while the model used
# n_estimators=i+1 (76..85) — the plot labels are off by one.
plt.plot(range(75,85),scorel)
[<matplotlib.lines.Line2D at 0x125cdbb5a08>]

在这里插入图片描述

max(scorel)
0.9666353383458647
# NOTE(review): index+75 recovers the loop index of the best score; since the
# forest had n_estimators=i+1, the best model actually used index+76 trees —
# the value 78 adopted below may be off by one. TODO confirm.
scorel.index(max(scorel))+75
# best value: 78
78

为网格搜索做准备,书写网格搜索的参数

2.3开始按照参数对模型整体准确率的影响程度进行调参,首先调整max_depth

# Grid-search max_depth over 1..19 with the chosen n_estimators=78, 10-fold CV.
param_grid = {'max_depth':np.arange(1, 20, 1)}
rfc=RandomForestClassifier(n_estimators=78
                           ,random_state=12)
GS=GridSearchCV(rfc,param_grid,cv=10)
GS.fit(data.data,data.target)

# NOTE(review): this echoed repr shows n_estimators=85 and n_jobs=-1, not the
# 78/None set above — it looks like stale output pasted from an earlier run.
GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=85, n_jobs=-1,
                                              oob_score=False, random_state=12,
                                              verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
GS.best_params_
{'max_depth': 9}
GS.best_score_
# Constraining max_depth scores 0.96488, below the n_estimators-only
# result (0.96664), so it does not help on this dataset.
0.9648809523809524

2.4 调整max_features

  • max_features是唯一一个即能够将模型往左(低方差高偏差)推,也能够将模型往右(高方差低偏差)推的参数
  • max_features 的默认值是 sqrt(n_features),使用这个值作为调参范围的最小值。
# Grid-search max_features over 5..29; sqrt(30) ≈ 5 (the default) is used
# as the lower bound of the range.
# BUG FIX: the original had bare `a` and `b` expressions, which only echoed
# the stale timestamps left over from the earlier learning-curve loop, so
# print(b-a) reported that loop's duration (20.36 s), not this grid search's.
# Re-take the timestamps around the fit so the printed time is meaningful.
a=time.time()
param_grid={"max_features":np.arange(5,30,1)}
rfc=RandomForestClassifier(n_estimators=78
                           ,random_state=12)
GS=GridSearchCV(rfc,param_grid,cv=10)
GS.fit(data.data,data.target)
b=time.time()
print(b-a)
20.36061978340149
GS.best_score_
0.968421052631579

2.5 调整min_samples_leaf

# Grid-search min_samples_leaf over 1..10 (1 is the default).
param_grid={'min_samples_leaf':np.arange(1, 1+10, 1)}
rfc=RandomForestClassifier(n_estimators=78
                           ,random_state=12)
GS=GridSearchCV(rfc,param_grid,cv=10)
GS.fit(data.data,data.target)
# NOTE(review): the bare `b` below only echoes a stale epoch timestamp from an
# earlier cell — no timing is actually measured for this search.
b
1597195996.2337263
GS.best_score_
# Score drops below the max_features result (0.96842), so min_samples_leaf
# is best left at its default.
0.9631265664160402

2.6 尝试min_samples_split

# Grid-search min_samples_split over 2..21 (2 is the default).
param_grid={'min_samples_split':np.arange(2, 2+20, 1)}
# NOTE(review): bare `a` / `b` only echo stale timestamps — no timing here.
a
rfc=RandomForestClassifier(n_estimators=78
                           ,random_state=12)
GS=GridSearchCV(rfc,param_grid,cv=10)
GS.fit(data.data,data.target)
b
1597195996.2337263
GS.best_score_
0.9649122807017545
# NOTE(review): `GS.score` without parentheses does not evaluate anything —
# it just echoes the bound method, as the output below shows.
GS.score

<bound method BaseSearchCV.score of GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=78, n_jobs=None,
                                              oob_score=False, random_state=12,
                                              verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)>

2.7 调参 criterion

# Grid-search the split criterion: 'gini' (default) vs 'entropy'.
param_grid = {'criterion':['gini', 'entropy']}
# NOTE(review): bare `a` / `b` only echo stale timestamps — no timing here.
a
rfc=RandomForestClassifier(n_estimators=78
                           ,random_state=12)
GS=GridSearchCV(rfc,param_grid,cv=10)
GS.fit(data.data,data.target)
b
1597195996.2337263
GS.best_score_
# No improvement over the default criterion.
0.9631265664160402

3 调整完毕,总结最好参数

# "Final" model with the tuned n_estimators=78.
# NOTE(review): this cell has a bug — the freshly built `rfc` below is never
# used. `GS.fit` re-runs the PREVIOUS GridSearchCV object, which still wraps
# the `rfc` from the criterion step and still searches the criterion grid (the
# echoed repr confirms param_grid={'criterion': ...}). The only tuning that
# actually improved on the baseline was n_estimators=78 plus max_features
# (0.96842); the 0.96313 reported here is the criterion-grid score again.
rfc=RandomForestClassifier(n_estimators=78
                           ,random_state=12)
GS.fit(data.data,data.target)
GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=78, n_jobs=None,
                                              oob_score=False, random_state=12,
                                              verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
GS.best_score_
0.9631265664160402

  • 菜菜的sklearn学习得到 https://live.bilibili.com/12582510
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值