在 2 个参数上使用 for 循环,对每种参数组合分别训练并评估一个分类器
# Naive grid search: train one SVC per (gamma, C) combination and remember
# the combination with the highest accuracy.
# NOTE: the accuracy used to pick the parameters is measured on the test set,
# so the same data is used both for model selection and for evaluation.
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0)
print("Size of training set: {} size of test set: {}".format(
    X_train.shape[0], X_test.shape[0]))

best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # train an SVC for this parameter combination
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the test set
        score = svm.score(X_test, y_test)
        # only a strictly better score replaces the stored score/parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))
Size of training set: 112 size of test set: 38
Best score: 0.97
Best parameters: {'C': 100, 'gamma': 0.001}
(1)将数据划分为 3 部分:训练集、验证集和测试集
from sklearn.svm import SVC

# First split: hold out the test set; the rest is training + validation data.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, random_state=0)
# Second split: carve the validation set out of the training + validation data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, random_state=1)
print("Size of training set: {} size of validation set: {} size of test set:"
      " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

# Parameter selection uses the validation set only; the test set stays unseen.
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # fit on the (smaller) training set, score on the validation set
        svm = SVC(gamma=gamma, C=C).fit(X_train, y_train)
        score = svm.score(X_valid, y_valid)
        # skip combinations that do not strictly improve on the best so far
        if not score > best_score:
            continue
        best_score = score
        best_parameters = dict(C=C, gamma=gamma)

# Refit with the winning parameters on training + validation data, then
# evaluate once on the test set.
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)

print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))
Size of training set: 84 size of validation set: 28 size of test set: 38
Best score on validation set: 0.96
Best parameters: {'C': 10, 'gamma': 0.001}
Test set score with best parameters: 0.92
验证集上的最高分数是 96%,这比之前略低,可能是因为我们使用更少的数据来训练模型。
# Grid search where each parameter combination is scored by its mean
# 5-fold cross-validation accuracy on the training + validation data.
# Reset the running best so a score left over from an earlier search cannot
# prevent best_parameters from being updated by this one.
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # build an SVC for this parameter combination
        svm = SVC(gamma=gamma, C=C)
        # run cross-validation
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # mean cross-validation accuracy
        score = np.mean(scores)
        # keep the score and parameters whenever the score improves
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# refit a model with the best parameters on the training + validation data
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
对于每种参数设置(图中仅显示了一部分),需要计算 5 个精度值,交叉验证的每次划分都要计算一个精度值。然后,对每种参数设置计算平均验证精度。最后,选择平均验证精度最高的参数,用圆圈标记。
如果 C 和 gamma 想要尝试的取值为 0.001、0.01、0.1、1、10 和 100,可以将其转化为下面这个字典:
# Search grid: parameter name -> list of candidate values to try.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1, 10, 100],
)
print("Parameter grid:\n{}".format(param_grid))
使用模型(SVC)、要搜索的参数网格(param_grid)与要使用的交叉验证策略(比如 5 折分层交叉验证)将 GridSearchCV 类实例化
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid search over param_grid with 5-fold cross-validation;
# return_train_score=True also records training scores in cv_results_.
grid_search = GridSearchCV(SVC(), param_grid, cv=5,
                           return_train_score=True)
# Split off a test set that the grid search never sees.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0)
我们创建的 grid_search 对象的行为就像是一个分类器,我们可以对它调用标准的 fit、predict 和 score 方法
# Run the cross-validated search over the parameter grid on the training data.
grid_search.fit(X_train, y_train)
拟合 GridSearchCV 对象不仅会搜索最佳参数,还会利用得到最佳交叉验证性能的参数在整个训练数据集上自动拟合一个新模型。
# Score the fitted grid search on the held-out test set.
test_set_score = grid_search.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_set_score))
我们找到的参数保存在 best_params_ 属性中,而交叉验证最佳精度(对于这种参数设置,不同划分的平均精度)保存在 best_score_ 中
# Report the selected parameters, their cross-validation score and the
# estimator built from them.
for template, value in (
        ("Best parameters: {}", grid_search.best_params_),
        ("Best cross-validation score: {:.2f}", grid_search.best_score_),
        ("Best estimator:\n{}", grid_search.best_estimator_)):
    print(template.format(value))
1. 分析交叉验证的结果
import pandas as pd
# Load the grid search's cv_results_ into a DataFrame for inspection.
results = pd.DataFrame(grid_search.cv_results_)
results 中每一行对应一种特定的参数设置。对于每种参数设置,交叉验证所有划分的结果都被记录下来,所有划分的平均值和标准差也被记录下来。由于我们搜索的是一个二维参数网格(C 和 gamma),所以最适合用热图可视化(见图 5-8)。
# Arrange the mean test scores as a 6 x 6 array and draw them as a heat map
# labelled with the candidate gamma values (x) and C values (y).
scores = np.array(results["mean_test_score"]).reshape(6, 6)
mglearn.tools.heatmap(
    scores,
    xlabel='gamma', ylabel='C',
    xticklabels=param_grid['gamma'], yticklabels=param_grid['C'],
    cmap="viridis")
# Compare three differently scaled / ranged parameter grids side by side,
# one heat map of mean cross-validation scores per grid.
fig, axes = plt.subplots(1, 3, figsize=(13, 5))

param_grid_linear = {'C': np.linspace(1, 2, 6),
                     'gamma': np.linspace(1, 2, 6)}
param_grid_one_log = {'C': np.linspace(1, 2, 6),
                      'gamma': np.logspace(-3, 2, 6)}
param_grid_range = {'C': np.logspace(-3, 2, 6),
                    'gamma': np.logspace(-7, -2, 6)}

# NOTE: the loop rebinds the module-level names param_grid and grid_search;
# after it they refer to the last grid (param_grid_range) and its search.
for param_grid, ax in zip([param_grid_linear, param_grid_one_log,
                           param_grid_range], axes):
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    scores = grid_search.cv_results_['mean_test_score'].reshape(6, 6)
    # plot the mean cross-validation scores
    scores_image = mglearn.tools.heatmap(
        scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'],
        yticklabels=param_grid['C'], cmap="viridis", ax=ax)

# single colorbar attached to all three panels
plt.colorbar(scores_image, ax=axes.tolist())
第一张图中精度没有任何变化,这是由参数 C 和 gamma 不正确的缩放以及不正确的范围造成的。
第二张图表示只有 gamma 的设置对精度有影响。
第三张图中 C 和 gamma 对应的精度都有变化。
2. 在非网格的空间中搜索
在某些情况下,尝试所有参数的所有可能组合(正如 GridSearchCV 所做的那样)并不是一个好主意
# Search a list of grids instead of one full grid: the RBF kernel varies both
# C and gamma, while the linear kernel gets its own grid that varies C only.
# (The nested cross-validation call that was pasted here belongs to the
# nested cross-validation section further down, where it also appears.)
param_grid = [{'kernel': ['rbf'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100],
               'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
              {'kernel': ['linear'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
print("List of grids:\n{}".format(param_grid))
List of grids:
[{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100],
'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
{'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
# Fit a 5-fold cross-validated grid search on the training data and report
# the winning parameters together with their cross-validation score.
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_found = grid_search.best_params_
best_cv_score = grid_search.best_score_
print("Best parameters: {}".format(best_found))
print("Best cross-validation score: {:.2f}".format(best_cv_score))
Best parameters: {'C': 100, 'kernel': 'rbf', 'gamma': 0.01}
Best cross-validation score: 0.97
查看 cv_results_,如果 kernel 等于 ‘linear’,那么只有 C 是变化的
# Load the grid search's cv_results_ into a DataFrame for inspection.
results = pd.DataFrame(grid_search.cv_results_)
# The table is presented transposed so that it fits the page better.
# NOTE(review): no statement that displays the transposed table (e.g.
# results.T) follows this comment -- confirm whether one was lost.
3. 使用不同的交叉验证策略进行网格搜索
与 cross_val_score 类似,GridSearchCV 对分类问题默认使用分层 k 折交叉验证,对回归问题默认使用 k 折交叉验证。
但是,你可以传入任何交叉验证分离器作为 GridSearchCV 的cv 参数
调用 cross_val_score,并用 GridSearchCV的一个实例作为模型:
# Nested cross-validation: the inner GridSearchCV (5-fold) selects the
# parameters, the outer cross_val_score (5-fold) scores the whole procedure.
inner_search = GridSearchCV(SVC(), param_grid, cv=5)
scores = cross_val_score(inner_search, iris.data, iris.target, cv=5)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())
Cross-validation scores: [ 0.967 1. 0.967 0.967 1. ]
Mean cross-validation score: 0.98
嵌套交叉验证的结果可以总结为“SVC 在 iris 数据集上的交叉验证平均精度为 98%”。
def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    """Score a classifier with nested cross-validation.

    For every outer split, each parameter setting is scored by its mean
    score over the inner splits of the outer training part. The best
    setting is then refit on the whole outer training part and scored on
    the outer test part.

    Parameters
    ----------
    X, y : array-like
        Data and targets; must support indexing with index arrays
        (e.g. NumPy arrays).
    inner_cv, outer_cv : object
        Splitters whose ``split(X, y)`` yields ``(train_idx, test_idx)``
        pairs of positions into the arrays passed to ``split``.
    Classifier : callable
        ``Classifier(**parameters)`` must return an object providing
        ``fit(X, y)`` and ``score(X, y)``.
    parameter_grid : iterable of dict
        Candidate keyword-argument settings for ``Classifier``.

    Returns
    -------
    numpy.ndarray
        One score per outer split.

    Raises
    ------
    ValueError
        If ``parameter_grid`` contains no settings.
    """
    # Materialize once so a one-shot iterable can be reused for every
    # outer split.
    candidates = list(parameter_grid)
    if not candidates:
        raise ValueError("parameter_grid must contain at least one setting")

    outer_scores = []
    # for each split of the data in the outer cross-validation
    # (split method returns indices of training and test parts)
    for training_samples, test_samples in outer_cv.split(X, y):
        # The inner splitter sees only the outer training part, so the
        # indices it yields are positions within these subsets, not within
        # the full X / y.
        X_outer = X[training_samples]
        y_outer = y[training_samples]

        # find best parameters using inner cross-validation
        best_params = None
        best_score = -np.inf
        for parameters in candidates:
            # accumulate score over inner splits
            cv_scores = []
            for inner_train, inner_test in inner_cv.split(X_outer, y_outer):
                # build classifier given parameters and training data
                clf = Classifier(**parameters)
                clf.fit(X_outer[inner_train], y_outer[inner_train])
                # evaluate on inner test set
                cv_scores.append(
                    clf.score(X_outer[inner_test], y_outer[inner_test]))
            # compute mean score over inner folds
            mean_score = np.mean(cv_scores)
            # The first candidate is always accepted so best_params is
            # defined even if every mean score is NaN; afterwards only a
            # strictly better score replaces it.
            if best_params is None or mean_score > best_score:
                best_score = mean_score
                best_params = parameters

        # build classifier on best parameters using outer training set
        clf = Classifier(**best_params)
        clf.fit(X_outer, y_outer)
        # evaluate on the outer test set
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return np.array(outer_scores)
在 iris 数据集上运行这个函数
from sklearn.model_selection import ParameterGrid, StratifiedKFold

# Run the hand-written nested cross-validation on iris with stratified
# 5-fold splitters for both the inner and the outer loop.
scores = nested_cv(iris.data, iris.target, StratifiedKFold(5),
                   StratifiedKFold(5), SVC, ParameterGrid(param_grid))
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [ 0.967 1. 0.967 0.967 1. ]
scikit-learn 不允许并行操作的嵌套。如果你在模型(比如随机森林)中使用了 n_jobs 选项,那么就不能在 GridSearchCV 中再使用 n_jobs 来搜索这个模型。