Cross-Validation (CV)

This post mainly covers grid search, k-fold cross-validation, and nested cross-validation.

1. Grid Search

Grid search tunes hyperparameters: it enumerates every combination in the parameter grid, keeps the best-performing model as best_estimator_, and uses that best_estimator_ for prediction.

Documentation: sklearn.model_selection.GridSearchCV

Code:

from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


def run():
    boston = datasets.load_boston()  # note: load_boston was removed in scikit-learn 1.2; on newer versions use fetch_california_housing instead
    X = boston['data']
    y = boston['target']
    print(X.shape, y.shape)
    model = RandomForestRegressor()
    n_estimators = [10, 20, 30]
    max_depth = [10, 20, 30]
    min_samples_split = [2, 4, 6]
    param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split}
    grid = GridSearchCV(estimator=model, cv=3, param_grid=param_grid, scoring='neg_mean_absolute_error')
    grid.fit(X, y)
    print(grid.best_score_, grid.best_params_)


if __name__ == '__main__':
    run()
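The opening paragraph says the point of the search is to predict with best_estimator_, which the code above stops short of. A minimal sketch of that last step, to be placed inside run() after grid.fit (X_new is a hypothetical stand-in for unseen samples):

    X_new = X[:5]  # hypothetical placeholder for new samples with the same features
    best_model = grid.best_estimator_  # already refitted on all of X, y (refit=True by default)
    print(best_model.predict(X_new))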

2. k-Fold Cross-Validation

Split the training data into k folds. Each round, one fold serves as the validation set and the remaining k-1 folds as the training set. Repeating this k times yields k models; each model predicts the test set, and the k predictions are averaged (or weighted-averaged) to give the final test prediction, while the k validation scores are averaged to give the final CV score.

[Figure: diagram of k-fold cross-validation splits]

Code:

import numpy as np
from sklearn.model_selection import KFold
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestRegressor


def run():
    boston = datasets.load_boston()
    X = boston['data']
    y = boston['target']
    print(X.shape, y.shape)
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    val_error = []
    for _fold, (_train, _val) in enumerate(folds.split(X)):
        print('Current fold:', _fold, end='\t')
        x_train, y_train = X[_train], y[_train]
        x_val, y_val = X[_val], y[_val]
        model = RandomForestRegressor(n_estimators=16, max_depth=20, min_samples_split=2)
        model.fit(x_train, y_train)
        val_pred = model.predict(x_val)
        error = metrics.mean_absolute_error(y_val, val_pred)
        val_error.append(error)
        print('error:', error)
    print('validation error:', np.mean(val_error), '+/-', np.std(val_error))


if __name__ == '__main__':
    run()
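The loop above reports the CV error but omits the test-set averaging the description mentions. A minimal sketch of that step, assuming X and y are loaded as above (the train/test split is hypothetical, added here for illustration):

import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hypothetical held-out test set; the example above used all the data in the folds.
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

folds = KFold(n_splits=5, shuffle=True, random_state=42)
test_preds = []
for _train, _val in folds.split(X_tr):
    model = RandomForestRegressor(n_estimators=16, max_depth=20, min_samples_split=2)
    model.fit(X_tr[_train], y_tr[_train])
    test_preds.append(model.predict(X_test))  # every fold model predicts the same test set

# Simple average of the k per-fold predictions gives the final test prediction.
final_test_pred = np.mean(test_preds, axis=0)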

3. Nested Cross-Validation

Nested CV combines the two approaches above. It has two layers:

The inner CV tunes hyperparameters and selects the model.

The outer CV estimates the generalization error.

[Figure: diagram of nested cross-validation]

Code:

import numpy as np
from sklearn.model_selection import KFold
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")


def run():
    boston = datasets.load_boston()
    X = boston['data']
    y = boston['target']
    print(X.shape, y.shape)
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    val_error = []
    for _fold, (_train, _val) in enumerate(folds.split(X)):
        print('Current fold:', _fold)
        x_train, y_train = X[_train], y[_train]
        x_val, y_val = X[_val], y[_val]

        n_estimators = [10, 20, 30]
        max_depth = [10, 20, 30]
        min_samples_split = [2, 4, 6]
        param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split}
        grid = GridSearchCV(estimator=RandomForestRegressor(), cv=3, param_grid=param_grid, scoring='neg_mean_absolute_error')
        grid.fit(x_train, y_train)  # fit the inner search on this fold's training data only; fitting on all of X would leak the validation fold
        print(grid.best_params_)

        model = grid.best_estimator_  # already refitted on x_train (refit=True), so no extra fit is needed
        val_pred = model.predict(x_val)
        error = metrics.mean_absolute_error(y_val, val_pred)
        val_error.append(error)
        print('error:', error)
    print('validation error:', np.mean(val_error), '+/-', np.std(val_error))


if __name__ == '__main__':
    run()
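Scikit-learn can express the same two-layer scheme more compactly: passing a GridSearchCV object to cross_val_score makes each outer fold run its own inner search on the outer training part only. A minimal sketch, assuming X and y are loaded as above:

from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {'n_estimators': [10, 20, 30], 'max_depth': [10, 20, 30], 'min_samples_split': [2, 4, 6]}
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(RandomForestRegressor(), param_grid=param_grid,
                    cv=inner_cv, scoring='neg_mean_absolute_error')
# cross_val_score clones grid for each outer fold, so the inner search never sees the outer validation fold
scores = cross_val_score(grid, X, y, cv=outer_cv, scoring='neg_mean_absolute_error')
print(-scores.mean(), '+/-', scores.std())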

Nested CV can also tune hyperparameters and select among different models at the same time (the code below is copied from Stack Overflow: source):

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

iris_raw_data = load_iris()
iris_df = pd.DataFrame(np.c_[iris_raw_data.data, iris_raw_data.target],
                       columns=iris_raw_data.feature_names + ['target'])
iris_category_labels = {0:'setosa', 1:'versicolor', 2:'virginica'}
iris_df['species_name'] = iris_df['target'].apply(lambda l: iris_category_labels[int(l)])

features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
target = 'target'
X_train, X_test, y_train, y_test = train_test_split(iris_df[features], iris_df[target], test_size=.33)

pipe_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', KernelPCA(n_components=2)),    # reduce to 2 principal components
    ('clf', KNeighborsClassifier())
    ])
params_knn = dict(scaler=[None, StandardScaler()],
                  reduce_dim=[None, KernelPCA(n_components=2)],
                  clf__n_neighbors=[2, 5, 15])
grid_search_knn = GridSearchCV(pipe_knn, param_grid=params_knn)

pipe_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', KernelPCA(n_components=2)),    # reduce to 2 principal components
    ('clf', SVC())
    ])
params_svc = dict(scaler=[None, StandardScaler()],
                  reduce_dim=[None, KernelPCA(n_components=2)],
                  clf__C=[0.1, 1, 10, 100])
grid_search_svc = GridSearchCV(pipe_svc, param_grid=params_svc)

pipe_rf = Pipeline(steps=[
    ('clf', RandomForestClassifier())
    ])
params_rf = dict(clf__n_estimators=[10, 50, 100],
                 clf__min_samples_leaf=[2, 5, 10])

grid_search_rf = GridSearchCV(pipe_rf, param_grid=params_rf)

pipe_meta = Pipeline(steps=[('subpipes', pipe_knn)])
params_meta = dict(subpipes=[grid_search_svc, grid_search_knn, grid_search_rf])
grid_search_meta = GridSearchCV(pipe_meta, param_grid=params_meta)

grid_search_meta.fit(X_train, y_train)
print(grid_search_meta.best_estimator_)
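Since train_test_split above set aside X_test and y_test, a natural follow-up is to check the winning configuration on the held-out data:

# score() uses the refitted best estimator; default scoring here is accuracy
print('test accuracy:', grid_search_meta.score(X_test, y_test))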

 
