1. 需求
1.1 验证曲线:
缺点:只能逐一对超参数进行调参
# The best values for the random-forest hyperparameters are not known up
# front, so a validation curve is used to choose them.
model = se.RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    random_state=7,
)
1.2 学习曲线
功能:确定最佳的训练集大小(即训练集与测试集的划分比例)
1.3 网格搜索
功能:验证曲线只能每次获取一个最优超参数。如果多个超参数有很多排列组合的话,就可以使用网格搜索寻求最优超参数组合。
2. 所需api
2.1 验证曲线
import sklearn.model_selection as ms  # cross-validation utilities

# Score the model once per candidate value of a single hyperparameter.
# param_name / param_range are passed by keyword: recent scikit-learn
# releases (1.0+) accept them as keyword-only arguments, so the positional
# form raises TypeError there.
train_scores, test_scores = ms.validation_curve(
    model,                                # estimator
    输入集, 输出集,                        # input samples, target values
    param_name='n_estimators',            # hyperparameter to vary
    param_range=np.arange(50, 550, 50),   # candidate values to try
    cv=5,                                 # number of folds
)
2.2 学习曲线
import sklearn.model_selection as ms  # cross-validation utilities

# Score the model at several training-set sizes.
# train_sizes must be passed by keyword: the fourth positional parameter of
# learning_curve is `groups` in older releases, and everything after
# (estimator, X, y) is keyword-only in recent ones.
# The first return value (absolute training-set sizes) is discarded here.
_, train_scores, test_scores = ms.learning_curve(
    model,                         # estimator
    输入集, 输出集,                 # input samples, target values
    train_sizes=[0.9, 0.8, 0.7],   # training-set sizes, as fractions
    cv=5,                          # number of folds
)
2.3 网格搜索
import sklearn.model_selection as ms

# Wrap the base estimator in a cross-validated search over the candidate
# hyperparameter combinations, then run the search.
model = ms.GridSearchCV(estimator=模型, param_grid=超参数组合列表, cv=折叠数)
model.fit(输入集, 输出集)
# Every hyperparameter combination that was evaluated
model.cv_results_['params']
# Mean test score of each combination, in the same order as 'params'
model.cv_results_['mean_test_score']
# Best hyperparameter combination
model.best_params_
# Mean cross-validated score achieved by the best combination
model.best_score_
# Estimator configured with the best combination
model.best_estimator_
3. 举例
3.1 验证曲线
import sklearn.model_selection as ms  # cross-validation utilities
import matplotlib.pyplot as mp

# Build the random-forest model
model = se.RandomForestClassifier(max_depth=9, n_estimators=144, random_state=7)

# Validation curve: choose the best n_estimators.
# The candidate range is bound once so the search and the plot's x axis
# cannot drift apart.
n_estimators_range = np.arange(140, 150, 1)
# param_name / param_range are passed by keyword; recent scikit-learn
# releases (1.0+) reject the positional form with TypeError.
train_score, test_score = ms.validation_curve(
    model, train_x, train_y,
    param_name='n_estimators',
    param_range=n_estimators_range,
    cv=5,
)
# Average over the folds: one mean test score per candidate value
print(test_score.mean(axis=1))

# Plot the validation curve
mp.grid(linestyle=':')
mp.plot(n_estimators_range, test_score.mean(axis=1), 'o-',
        color='dodgerblue', label='n_estimators')
mp.legend()
mp.show()
# Validation curve: choose the best max_depth.
# The candidate range is bound once so the search and the plot's x axis
# cannot drift apart.
max_depth_range = np.arange(1, 11, 1)
# param_name / param_range are passed by keyword; recent scikit-learn
# releases (1.0+) reject the positional form with TypeError.
train_score, test_score = ms.validation_curve(
    model, train_x, train_y,
    param_name='max_depth',
    param_range=max_depth_range,
    cv=5,
)
# Average over the folds: one mean test score per candidate value
print(test_score.mean(axis=1))

# Plot the validation curve
import matplotlib.pyplot as mp
mp.grid(linestyle=':')
mp.plot(max_depth_range, test_score.mean(axis=1), 'o-',
        color='dodgerblue', label='max_depth')
mp.legend()
mp.show()
3.2 学习曲线
import sklearn.model_selection as ms  # cross-validation utilities
import matplotlib.pyplot as mp

model = se.RandomForestClassifier(max_depth=9, n_estimators=144, random_state=7)

# Learning curve: score the model at training-set fractions 0.1 .. 0.9
train_size = np.arange(0.1, 1.0, 0.1)
# The first return value (absolute training-set sizes) is discarded here.
_, train_score, test_score = ms.learning_curve(
    model, train_x, train_y,
    train_sizes=train_size, cv=5,
)
# Average over the folds: one mean test score per training-set size
test_mean = test_score.mean(axis=1)

# Plot the learning curve
mp.grid(linestyle=':')
# The line needs a label: legend() with no labelled artist draws an empty
# legend and emits a warning.
mp.plot(train_size, test_mean, 'o-',
        color='dodgerblue', label='mean test score')
mp.legend()
mp.show()
3.3 网格搜索
import sklearn.model_selection as ms

# Support-vector classifier tuned by grid search. The grid compares three
# kernel families (linear, polynomial, RBF), each with its own parameters.
params = [
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'C': [1], 'degree': [2, 3]},
    {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001]},
]
model = ms.GridSearchCV(svm.SVC(probability=True), params, cv=5)
model.fit(train_x, train_y)

# Print every evaluated combination with its mean test score.
# The loop body must be indented; left at column 0 it is an IndentationError.
for p, s in zip(model.cv_results_['params'],
                model.cv_results_['mean_test_score']):
    print(p, s)

# Best-scoring hyperparameter combination
print(model.best_params_)
# Best score
print(model.best_score_)
# Estimator configured with the best combination
print(model.best_estimator_)