Cross-Validation (CV)

This post mainly covers grid search, k-fold cross-validation, and nested cross-validation.

1. Grid Search

Grid search tunes hyperparameters: it enumerates every combination in the parameter grid, keeps the best-performing model as best_estimator_, and uses that best_estimator_ for prediction.

Documentation: sklearn.model_selection.GridSearchCV

Code:

from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


def run():
    boston = datasets.load_boston()  # note: load_boston was removed in scikit-learn 1.2; on newer versions use fetch_california_housing instead
    X = boston['data']
    y = boston['target']
    print(X.shape, y.shape)
    model = RandomForestRegressor()
    n_estimators = [10, 20, 30]
    max_depth = [10, 20, 30]
    min_samples_split = [2, 4, 6]
    param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split}
    grid = GridSearchCV(estimator=model, cv=3, param_grid=param_grid, scoring='neg_mean_absolute_error')
    grid.fit(X, y)
    print(grid.best_score_, grid.best_params_)


if __name__ == '__main__':
    run()
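The opening paragraph says the point of the search is to predict with best_estimator_, which the code above stops short of. A minimal sketch of that last step, to be placed inside run() after grid.fit (X_new is a hypothetical stand-in for unseen samples):

    X_new = X[:5]  # hypothetical placeholder for new samples with the same features
    best_model = grid.best_estimator_  # already refitted on all of X, y (refit=True by default)
    print(best_model.predict(X_new))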

2. k-Fold Cross-Validation

Split the training data into k folds. Each round, one fold serves as the validation set and the remaining k-1 folds as the training set. Repeating this k times yields k models; each model predicts the test set, and the k predictions are averaged (or weighted-averaged) to give the final test prediction, while the k validation scores are averaged to give the final CV score.

[Figure: diagram of k-fold cross-validation splits]

Code:

import numpy as np
from sklearn.model_selection import KFold
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestRegressor


def run():
    boston = datasets.load_boston()
    X = boston['data']
    y = boston['target']
    print(X.shape, y.shape)
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    val_error = []
    for _fold, (_train, _val) in enumerate(folds.split(X)):
        print('Current fold:', _fold, end='\t')
        x_train, y_train = X[_train], y[_train]
        x_val, y_val = X[_val], y[_val]
        model = RandomForestRegressor(n_estimators=16, max_depth=20, min_samples_split=2)
        model.fit(x_train, y_train)
        val_pred = model.predict(x_val)
        error = metrics.mean_absolute_error(y_val, val_pred)
        val_error.append(error)
        print('error:', error)
    print('validation error:', np.mean(val_error), '+/-', np.std(val_error))


if __name__ == '__main__':
    run()
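The loop above reports the CV error but omits the test-set averaging the description mentions. A minimal sketch of that step, assuming X and y are loaded as above (the train/test split is hypothetical, added here for illustration):

import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hypothetical held-out test set; the example above used all the data in the folds.
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

folds = KFold(n_splits=5, shuffle=True, random_state=42)
test_preds = []
for _train, _val in folds.split(X_tr):
    model = RandomForestRegressor(n_estimators=16, max_depth=20, min_samples_split=2)
    model.fit(X_tr[_train], y_tr[_train])
    test_preds.append(model.predict(X_test))  # every fold model predicts the same test set

# Simple average of the k per-fold predictions gives the final test prediction.
final_test_pred = np.mean(test_preds, axis=0)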

3. Nested Cross-Validation

Nested CV combines the two approaches above. It has two layers:

The inner CV tunes hyperparameters and selects the model.

The outer CV estimates the generalization error.

[Figure: diagram of nested cross-validation]

Code:

import numpy as np
from sklearn.model_selection import KFold
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")


def run():
    boston = datasets.load_boston()
    X = boston['data']
    y = boston['target']
    print(X.shape, y.shape)
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    val_error = []
    for _fold, (_train, _val) in enumerate(folds.split(X)):
        print('Current fold:', _fold)
        x_train, y_train = X[_train], y[_train]
        x_val, y_val = X[_val], y[_val]

        n_estimators = [10, 20, 30]
        max_depth = [10, 20, 30]
        min_samples_split = [2, 4, 6]
        param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split}
        grid = GridSearchCV(estimator=RandomForestRegressor(), cv=3, param_grid=param_grid, scoring='neg_mean_absolute_error')
        grid.fit(x_train, y_train)  # fit the inner search on this fold's training data only; fitting on all of X would leak the validation fold
        print(grid.best_params_)

        model = grid.best_estimator_  # already refitted on x_train (refit=True), so no extra fit is needed
        val_pred = model.predict(x_val)
        error = metrics.mean_absolute_error(y_val, val_pred)
        val_error.append(error)
        print('error:', error)
    print('validation error:', np.mean(val_error), '+/-', np.std(val_error))


if __name__ == '__main__':
    run()
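Scikit-learn can express the same two-layer scheme more compactly: passing a GridSearchCV object to cross_val_score makes each outer fold run its own inner search on the outer training part only. A minimal sketch, assuming X and y are loaded as above:

from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {'n_estimators': [10, 20, 30], 'max_depth': [10, 20, 30], 'min_samples_split': [2, 4, 6]}
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(RandomForestRegressor(), param_grid=param_grid,
                    cv=inner_cv, scoring='neg_mean_absolute_error')
# cross_val_score clones grid for each outer fold, so the inner search never sees the outer validation fold
scores = cross_val_score(grid, X, y, cv=outer_cv, scoring='neg_mean_absolute_error')
print(-scores.mean(), '+/-', scores.std())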

Nested CV can also tune hyperparameters and select among different models at the same time (the code below is copied from Stack Overflow: source):

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

iris_raw_data = load_iris()
iris_df = pd.DataFrame(np.c_[iris_raw_data.data, iris_raw_data.target],
                       columns=iris_raw_data.feature_names + ['target'])
iris_category_labels = {0:'setosa', 1:'versicolor', 2:'virginica'}
iris_df['species_name'] = iris_df['target'].apply(lambda l: iris_category_labels[int(l)])

features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
target = 'target'
X_train, X_test, y_train, y_test = train_test_split(iris_df[features], iris_df[target], test_size=.33)

pipe_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', KernelPCA(n_components=2)),    # reduce to 2 principal components
    ('clf', KNeighborsClassifier())
    ])
params_knn = dict(scaler=[None, StandardScaler()],
                  reduce_dim=[None, KernelPCA(n_components=2)],
                  clf__n_neighbors=[2, 5, 15])
grid_search_knn = GridSearchCV(pipe_knn, param_grid=params_knn)

pipe_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', KernelPCA(n_components=2)),    # reduce to 2 principal components
    ('clf', SVC())
    ])
params_svc = dict(scaler=[None, StandardScaler()],
                  reduce_dim=[None, KernelPCA(n_components=2)],
                  clf__C=[0.1, 1, 10, 100])
grid_search_svc = GridSearchCV(pipe_svc, param_grid=params_svc)

pipe_rf = Pipeline(steps=[
    ('clf', RandomForestClassifier())
    ])
params_rf = dict(clf__n_estimators=[10, 50, 100],
                 clf__min_samples_leaf=[2, 5, 10])

grid_search_rf = GridSearchCV(pipe_rf, param_grid=params_rf)

pipe_meta = Pipeline(steps=[('subpipes', pipe_knn)])
params_meta = dict(subpipes=[grid_search_svc, grid_search_knn, grid_search_rf])
grid_search_meta = GridSearchCV(pipe_meta, param_grid=params_meta)

grid_search_meta.fit(X_train, y_train)
print(grid_search_meta.best_estimator_)
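Since train_test_split above set aside X_test and y_test, a natural follow-up is to check the winning configuration on the held-out data:

# score() uses the refitted best estimator; default scoring here is accuracy
print('test accuracy:', grid_search_meta.score(X_test, y_test))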

 
