# Random forest hyperparameter tuning on the breast cancer dataset (随机森林在乳腺癌数据上的调参)
from sklearn. datasets import load_breast_cancer
from sklearn. ensemble import RandomForestClassifier
from sklearn. model_selection import GridSearchCV
from sklearn. model_selection import cross_val_score
import matplotlib. pyplot as plt
import pandas as pd
import numpy as np
# Load the breast-cancer dataset and record a baseline 10-fold CV accuracy
# using a default-ish forest (100 trees, fixed seed) before any tuning.
data = load_breast_cancer()
rfc = RandomForestClassifier(n_estimators=100, random_state=90)
score_pre = cross_val_score(rfc, data.data, data.target, cv=10).mean()
# Coarse search over n_estimators: evaluate 1, 11, 21, ..., 191 trees and
# record the 10-fold CV accuracy for each setting.
# NOTE: the original paste had the loop body unindented (IndentationError);
# restored here.
scorel = []
for i in range(0, 200, 10):
    rfc = RandomForestClassifier(n_estimators=i + 1,
                                 n_jobs=-1,
                                 random_state=90)
    score = cross_val_score(rfc, data.data, data.target, cv=10).mean()
    scorel.append(score)
# Best CV score and the tree count that produced it (index*10 + 1 maps the
# list position back to the n_estimators value tried).
print(max(scorel), (scorel.index(max(scorel)) * 10) + 1)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 201, 10), scorel)
plt.show()
# Fine-grained search around the coarse optimum: try every n_estimators in
# [35, 45) and record 10-fold CV accuracy.
# NOTE: the original paste had the loop body unindented (IndentationError);
# restored here.
scorel = []
for i in range(35, 45):
    rfc = RandomForestClassifier(n_estimators=i,
                                 n_jobs=-1,
                                 random_state=90)
    score = cross_val_score(rfc, data.data, data.target, cv=10).mean()
    scorel.append(score)
# Best score and the exact n_estimators value it corresponds to.
print(max(scorel), ([*range(35, 45)][scorel.index(max(scorel))]))
plt.figure(figsize=[20, 5])
plt.plot(range(35, 45), scorel)
plt.show()
# Tune max_depth with the best n_estimators (39) held fixed.
param_grid = {'max_depth': np.arange(1, 20, 1)}
rfc = RandomForestClassifier(n_estimators=39,
                             random_state=90)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
# Bare attribute access is a no-op in a script (notebook residue); print the
# results so they are actually visible.
print(GS.best_params_)
print(GS.best_score_)
# max_features is the only parameter that can push the model either left
# (lower variance / higher bias) or right (higher variance / lower bias) on
# the generalization-error curve, so the tuning direction depends on which
# side of the error minimum the model currently sits.  Here the model is on
# the left side, so we want MORE complexity: increase max_features, since
# more candidate features per split makes the model more complex.  The
# default minimum is sqrt(n_features), which we use as the lower bound of
# the search range.
param_grid = {'max_features': np.arange(5, 30, 1)}
rfc = RandomForestClassifier(n_estimators=39,
                             random_state=90)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
# Bare attribute access is a no-op in a script (notebook residue); print the
# results so they are actually visible.
print(GS.best_params_)
print(GS.best_score_)
# Tune min_samples_leaf over its default value (1) through 1 + 10.
param_grid = {'min_samples_leaf': np.arange(1, 1 + 10, 1)}
rfc = RandomForestClassifier(n_estimators=39,
                             random_state=90)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
# Bare attribute access is a no-op in a script (notebook residue); print the
# results so they are actually visible.
print(GS.best_params_)
print(GS.best_score_)
# Tune min_samples_split over its default value (2) through 2 + 20.
param_grid = {'min_samples_split': np.arange(2, 2 + 20, 1)}
rfc = RandomForestClassifier(n_estimators=39,
                             random_state=90)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
# Bare attribute access is a no-op in a script (notebook residue); print the
# results so they are actually visible.
print(GS.best_params_)
print(GS.best_score_)
# Tune the split-quality criterion (gini impurity vs. entropy).
param_grid = {'criterion': ['gini', 'entropy']}
rfc = RandomForestClassifier(n_estimators=39,
                             random_state=90)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
# Bare attribute access is a no-op in a script (notebook residue); print the
# results so they are actually visible.
print(GS.best_params_)
print(GS.best_score_)
# Final model with the tuned n_estimators; compare its 10-fold CV accuracy
# against the pre-tuning baseline (score_pre).
rfc = RandomForestClassifier(n_estimators=39, random_state=90)
score = cross_val_score(rfc, data.data, data.target, cv=10).mean()
# Bare expressions are no-ops in a script (notebook residue); print the
# final score and its improvement over the baseline.
print(score)
print(score - score_pre)