1.通过学习曲线调参
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
def Rfc():
    """Coarse learning curve over the number of trees in a random forest.

    Loads the breast-cancer dataset, evaluates a RandomForestClassifier with
    1, 11, 21, ..., 191 trees using the mean of a 10-fold cross-validation
    score, prints the best mean score together with the tree count that
    produced it, and shows the score-vs-tree-count curve.

    Returns:
        None. Results are printed and plotted.
    """
    data = load_breast_cancer()

    # The grid is shifted by one so the smallest forest has 1 tree, not 0.
    # Build it once so training, the printed optimum and the plot's x-axis
    # all refer to exactly the same values.
    tree_counts = [i + 1 for i in range(0, 200, 10)]

    scorel = []
    for n_trees in tree_counts:
        rfc = RandomForestClassifier(n_estimators=n_trees,
                                     n_jobs=-1,
                                     random_state=10)
        score = cross_val_score(rfc, data.data, data.target, cv=10).mean()
        scorel.append(score)

    best_score = max(scorel)
    # list.index returns the first occurrence, so ties resolve to the
    # smallest tree count.
    print(best_score, tree_counts[scorel.index(best_score)])

    plt.figure(figsize=[20, 5])
    # Plot against the tree counts actually evaluated; plotting against
    # range(0, 200, 10) would place every point one tree too far left.
    plt.plot(tree_counts, scorel)
    plt.show()
def Rfc_1():
    """Fine-grained search over the number of trees, one tree at a time.

    Evaluates a RandomForestClassifier on the breast-cancer dataset for every
    tree count in 65..74 using the mean 10-fold cross-validation score, then
    prints the best mean score and the tree count that achieved it.
    No plot is drawn.

    Returns:
        None. The result is printed.
    """
    dataset = load_breast_cancer()
    candidates = [*range(65, 75)]

    # One mean cross-validation score per candidate tree count, in order.
    scorel = [
        cross_val_score(
            RandomForestClassifier(n_estimators=n_trees,
                                   n_jobs=-1,
                                   random_state=10),
            dataset.data,
            dataset.target,
            cv=10,
        ).mean()
        for n_trees in candidates
    ]

    top_score = max(scorel)
    # First occurrence of the maximum, i.e. the smallest winning tree count.
    print(top_score, candidates[scorel.index(top_score)])
if __name__ == "__main__":
    # Run the coarse sweep first, then the fine-grained sweep.
    for experiment in (Rfc, Rfc_1):
        experiment()
上述代码的意思是:先让随机森林中树的数量从 1 棵到 191 棵、每次增加 10 棵(共 20 组),对每一组做 10 折交叉验证,并把平均分数存放到列表中;训练完成后打印最高分及其对应的树的数量,再画一个学习曲线图,发现在 71 棵树时,平均分值最高。
那么我们再进行细化:原来是每次训练增加十棵树,我们再将范围缩小,确定范围在 65-75(不含 75),每增加一棵树训练一次,这样最终确定在 69 棵树的时候分值达到最高。