动手学机器学习(第二版)-第二章端到端的机器学习项目

最新推荐文章于 2024-08-22 10:56:53 发布

madao10086+

最新推荐文章于 2024-08-22 10:56:53 发布

阅读量968

点赞数 2

分类专栏：动手学机器学习文章标签：算法 python 机器学习

本文链接：https://blog.csdn.net/qq_36178962/article/details/107641860

版权

动手学机器学习专栏收录该内容

3 篇文章 9 订阅

订阅专栏

第二章端到端的机器学习项目

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
_sk_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sk_version >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension); also echoed to stdout.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

下载数据

import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/madao33/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Unpack the local ``housing.tgz`` archive into ``housing_path``.

    Args:
        housing_url: URL of the archive. Currently unused because the download
            step is disabled (see the note in the body); kept so that existing
            callers keep working.
        housing_path: Directory expected to contain ``housing.tgz``. It is
            created if missing, and the archive is extracted into it.

    Raises:
        FileNotFoundError: If ``housing.tgz`` is not present in ``housing_path``.
    """
    # exist_ok avoids the check-then-create race of isdir() followed by makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # The download below raised URLError for the author, so it is disabled and
    # the archive must be placed in housing_path by hand:
    # urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

快速查看数据结构

housing = load_housing_data()
housing.head()

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
0	-122.23	37.88	41.0	880.0	129.0	322.0	126.0	8.3252	452600.0	NEAR BAY
1	-122.22	37.86	21.0	7099.0	1106.0	2401.0	1138.0	8.3014	358500.0	NEAR BAY
2	-122.24	37.85	52.0	1467.0	190.0	496.0	177.0	7.2574	352100.0	NEAR BAY
3	-122.25	37.85	52.0	1274.0	235.0	558.0	219.0	5.6431	341300.0	NEAR BAY
4	-122.25	37.85	52.0	1627.0	280.0	565.0	259.0	3.8462	342200.0	NEAR BAY

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

housing.describe()

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
count	20640.000000	20640.000000	20640.000000	20640.000000	20433.000000	20640.000000	20640.000000	20640.000000	20640.000000
mean	-119.569704	35.631861	28.639486	2635.763081	537.870553	1425.476744	499.539680	3.870671	206855.816909
std	2.003532	2.135952	12.585558	2181.615252	421.385070	1132.462122	382.329753	1.899822	115395.615874
min	-124.350000	32.540000	1.000000	2.000000	1.000000	3.000000	1.000000	0.499900	14999.000000
25%	-121.800000	33.930000	18.000000	1447.750000	296.000000	787.000000	280.000000	2.563400	119600.000000
50%	-118.490000	34.260000	29.000000	2127.000000	435.000000	1166.000000	409.000000	3.534800	179700.000000
75%	-118.010000	37.710000	37.000000	3148.000000	647.000000	1725.000000	605.000000	4.743250	264725.000000
max	-114.310000	41.950000	52.000000	39320.000000	6445.000000	35682.000000	6082.000000	15.000100	500001.000000

housing.hist(bins=100, figsize=(20, 15))
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u9bMjx3Q-1595930064219)(chap2_files/chap2_11_0.png)]

创建测试数据集

def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a train part and a test part.

    The rows are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining ones the training set.

    Returns:
        A ``(train, test)`` tuple selected from ``data`` with ``.iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    cut = int(n_rows * test_ratio)
    test_positions, train_positions = order[:cut], order[cut:]
    return data.iloc[train_positions], data.iloc[test_positions]

train_set, test_set = split_train_test(housing, 0.2)

len(train_set)

len(test_set)

from zlib import crc32
def test_set_check(indentfier, test_ratio):
    return crc32(np.int64(indentfier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` based on each row's identifier.

    Every value in ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns True go to the test set, all others to the train set.
    """
    test_mask = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio)
    )
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

housing_with_id["id"] = housing["longitude"] *1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state = 42)

housing["income_cat"] = pd.cut(housing["median_income"], bins = [0., 1.5, 3.0, 4.5, 6., np.inf], labels = [1, 2, 3, 4, 5])

housing["income_cat"].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f60c67ddc88>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-a3QrCd6w-1595930064221)(chap2_files/chap2_23_1.png)]

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row numbers, so select with .iloc: .loc would
# treat them as index labels and only matches while housing keeps its
# default 0..n-1 index.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# .copy() makes the two sets independent frames rather than selections of
# housing, so they can be modified in place later.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

strat_test_set["income_cat"].value_counts() / len(strat_test_set)

3    0.350533
2    0.318798
4    0.176357
5    0.114583
1    0.039729
Name: income_cat, dtype: float64

# Remove the income_cat column from both stratified sets (in place)
for split_frame in (strat_train_set, strat_test_set):
    split_frame.drop(columns="income_cat", inplace=True)

strat_test_set.head()

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
5241	-118.39	34.12	29.0	6447.0	1012.0	2184.0	960.0	8.2816	500001.0	<1H OCEAN
10970	-117.86	33.77	39.0	4159.0	655.0	1669.0	651.0	4.6111	240300.0	<1H OCEAN
20351	-119.05	34.21	27.0	4357.0	926.0	2110.0	876.0	3.0119	218200.0	<1H OCEAN
6568	-118.15	34.20	52.0	1786.0	306.0	1018.0	322.0	4.1518	182100.0	INLAND
13285	-117.68	34.07	32.0	1775.0	314.0	1067.0	302.0	4.0375	121300.0	INLAND

可视化和探索数据

housing = strat_train_set.copy()

housing.plot(kind="scatter", x = "longitude", y ="latitude")

<matplotlib.axes._subplots.AxesSubplot at 0x7f60c6777a90>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-YQ0cA0pd-1595930064223)(chap2_files/chap2_30_1.png)]

housing.plot(kind="scatter", x = "longitude", y ="latitude", alpha = 0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f60c7b481d0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-6aoSlfuY-1595930064227)(chap2_files/chap2_31_1.png)]

housing.plot(kind="scatter", x="longitude",y="latitude", alpha=0.4, s=housing["population"]/100, label="population", figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()

<matplotlib.legend.Legend at 0x7f60c7bfc668>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-0Bcw9HxK-1595930064229)(chap2_files/chap2_32_1.png)]

查看关联

# Correlate numeric columns only: ocean_proximity is text, and pandas >= 2.0
# raises on non-numeric columns instead of silently skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7ce79e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7cf2748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c8471b00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7f212e8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7fbc128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7eadc50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7e78518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7d67438>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7d67588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c6d0eeb8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c80cc470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7fe43c8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c6db4208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7fc8b70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c83e24a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c8315240>]],
      dtype=object)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-8fkSfiJv-1595930064230)(chap2_files/chap2_36_1.png)]

housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f60c85bf470>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hOJ5h5uL-1595930064231)(chap2_files/chap2_37_1.png)]

尝试属性结合

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

# Correlate numeric columns only: ocean_proximity is text, and pandas >= 2.0
# raises on non-numeric columns instead of silently skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

为机器学习算法准备数据

housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

数据清理

大多数的机器学习算法无法在缺少特征的情况下正常运行，因此需要采用一些特殊的方法处理数据，一般有以下三种选择：

去除相关的区域
去除缺少部分数据的整个属性
将缺失的部分用某个值代替(0, 均值，中值等)

这里采用值替代的方法

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_
housing_num.median().values

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

X = imputer.transform(housing_num)

# Reuse the original row index so housing_tr stays aligned with housing_num;
# without it the rows would be renumbered 0..n-1.
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

处理文本和分类属性

housing_cat = housing["ocean_proximity"]
housing_cat.head(10)

17606     <1H OCEAN
18632     <1H OCEAN
14650    NEAR OCEAN
3230         INLAND
3555      <1H OCEAN
19480        INLAND
8879      <1H OCEAN
13685        INLAND
4937      <1H OCEAN
4861      <1H OCEAN
Name: ocean_proximity, dtype: object

大多数机器学习算法处理数值会更加容易，因此在这里我们将分类转换为数值，使用Scikit-Learn的OrdinalEncoder类

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
# 新版的sklearn输入的数组必须是二维数组，否则会报错
housing_cat = np.array(housing_cat).reshape(-1,1)
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

# 独热编码(one-hot encoding)，二进制位一位为1表示某个属性
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

housing_cat_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

当分类属性的类别非常多时(例如地区代码、物种等)，独热编码会产生大量的输入特征，这会降低训练和迭代的速度。解决的办法是将相似的类别合并，或者用其他相关的数值特征替代该分类属性

自定义转换器(Custom Transformers)

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinesArrtibutesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D feature array.

    The source columns are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.
    """

    def __init__(self, add_bedrooms_per_room = True):
        # When true, transform() also appends the bedrooms-per-room column.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Appended columns, in order: rooms per household, population per
        household and, if ``add_bedrooms_per_room`` is set, bedrooms per room.
        """
        households = X[:, households_ix]
        extra_columns = [
            X[:, rooms_ix] / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # np.c_ stacks X and every extra column side by side.
        return np.c_[tuple([X] + extra_columns)]
        

attr_adder = CombinesArrtibutesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

特征缩放(feature scaling)

最小-最大缩放(min-max scaling，也常称为归一化 normalization)：将数值范围缩放到0-1

$x'=\frac{x-\min(x)}{\max(x)-\min(x)}$

标准化(Standardization)：处理后的数据均值为0、方差为1，但数值不会被缩放到特定的范围(例如0-1)，这在某些算法中可能引发问题，例如神经网络常常期望输入数据的范围在0-1之间

$x=\frac{x-mean(x)}{std(x)}$

$std(x)=\sqrt{\frac{\sum(x-mean(x))^2}{n}}$

转换数据流(Transformation Pipelines)

许多数据转换步骤需要按照正确的顺序执行。Scikit-Learn提供了Pipeline类来帮助实现这种转换序列

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinesArrtibutesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

选择和训练模型

至此，你已经明确了问题，获取并仔细查看了数据，将数据集划分为训练数据集和测试数据集，并编写了转换管道(transformation pipelines)来自动清理数据、为机器学习算法准备数据。现在你已经准备好选择和训练机器学习模型了。

训练及评估训练数据集

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

# 测试训练好的线性模型
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]

print("Labels", list(some_labels))

Labels [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]

# 检验线性回归模型的在整个训练集的RMSE
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68628.19819848922

很明显，模型在训练数据集上得到的RMSE较大，毕竟大多数街区的房价中位数(median_house_value)在 \$120,000 到 \$265,000 之间。这是模型欠拟合数据的典型例子，可能是数据特征不能提供做出准确预测所需的信息，亦或是模型本身不够强大。解决的方法有：

选择更加强大(复杂)的模型
给训练算法提供更多的特征
减少模型的限制

这个模型没有正则化，所以我们排除了最后一个选项(减少模型限制)，首先尝试一个复杂度更高的模型，看看结果如何。这里选择的是DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor()

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

这里发现训练误差完全为0，可以判断是模型过拟合

使用交叉校验更好地评估

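一种方法是使用train_test_split将训练数据集进一步划分为更小的训练集和验证集；这里采用另一种方案——K折交叉验证(K-fold cross-validation)：cross_val_score把训练集分成10折(cv=10)，每次用其中9折训练、剩下1折评估，共得到10个评估分数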

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print the scores followed by their mean and standard deviation."""
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    # Each statistic is computed right before its line is printed.
    for label, compute in report:
        print(label, compute(scores))

display_scores(tree_rmse_scores)

Scores: [70107.18598369 67427.66761479 70494.7133846  68052.83797215
 69684.83810574 74858.44630746 70286.7560609  70972.86657154
 75833.77220411 70506.52871244]
Mean: 70822.56129174255
Standard deviation: 2508.395675655727

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
Mean: 69052.46136345083
Standard deviation: 2731.674001798348

对比两种模型，发现决策树模型严重过拟合，预测结果比线性回归模型更差

现在尝试最后一种模型，随机森林RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor()

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", forest_reg.predict(some_data_prepared))

Predictions: [272894.   339038.03 221886.    51069.   235920.  ]

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18826.653283467

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error",cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [49568.0164352  47558.38150037 50101.59486549 51937.94163799
 49526.1024044  53480.52936402 49095.33120346 47681.52808254
 52794.074608   50179.56480025]
Mean: 50192.30649017154
Standard deviation: 1895.1445809297988

import joblib

joblib.dump(forest_reg, "my_model.pkl")

['my_model.pkl']

微调模型

网格搜索(Grid Search)

一种方法是手动调整超参数(hyperparameters)，但这比较繁琐，也没有足够的时间去探索大量的组合
更好的方法是使用Scikit-Learn的GridSearchCV自动搜索

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},
    {'bootstrap':[False], 'n_estimators':[3, 10], 'max_features':[2, 3, 4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
             cv=5,scoring='neg_mean_squared_error',              return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63894.67087631534 {'max_features': 2, 'n_estimators': 3}
55667.41733417563 {'max_features': 2, 'n_estimators': 10}
52953.67120428983 {'max_features': 2, 'n_estimators': 30}
60759.33678765079 {'max_features': 4, 'n_estimators': 3}
52610.21756065792 {'max_features': 4, 'n_estimators': 10}
50825.02549091298 {'max_features': 4, 'n_estimators': 30}
59609.16940847891 {'max_features': 6, 'n_estimators': 3}
52329.80288580481 {'max_features': 6, 'n_estimators': 10}
50170.784779709335 {'max_features': 6, 'n_estimators': 30}
59002.639747336456 {'max_features': 8, 'n_estimators': 3}
51925.125329680406 {'max_features': 8, 'n_estimators': 10}
50163.82162133027 {'max_features': 8, 'n_estimators': 30}
61646.782726933416 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54460.336888076854 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60425.28010672884 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52606.60637645159 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
59026.038679228404 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51279.348751331956 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

随机搜索

需要尝试的超参数组合较少时可以采用网格搜索的方法，但是在超参数搜索空间较大时，更好的选择是使用RandomizedSearchCV

组合方法(Ensemble Methods)

另一种微调系统的方法就是尝试将表现最好的模型组合起来，组合常常比单个模型表现的更好

分析最好的模型和误差

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.85740666e-02, 6.17182312e-02, 4.27956600e-02, 1.55182236e-02,
       1.49635160e-02, 1.44385034e-02, 1.45608964e-02, 3.71913941e-01,
       5.61301486e-02, 1.14093585e-01, 4.72391450e-02, 3.23995057e-03,
       1.70264023e-01, 1.08383941e-04, 1.77172464e-03, 2.67000179e-03])

extras_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extras_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3719139405152314, 'median_income'),
 (0.170264022826082, 'INLAND'),
 (0.11409358481656678, 'pop_per_hhold'),
 (0.06857406663435783, 'longitude'),
 (0.06171823122997493, 'latitude'),
 (0.0561301485724391, 'rooms_per_hhold'),
 (0.04723914504113055, 'bedrooms_per_room'),
 (0.04279566004600196, 'housing_median_age'),
 (0.015518223555873671, 'total_rooms'),
 (0.01496351603193718, 'total_bedrooms'),
 (0.01456089643377324, 'households'),
 (0.014438503352036969, 'population'),
 (0.003239950572947161, '<1H OCEAN'),
 (0.0026700017864008615, 'NEAR OCEAN'),
 (0.0017717246440854596, 'NEAR BAY'),
 (0.00010838394116097372, 'ISLAND')]

在测试数据集上评价你的系统

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

x_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(x_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

48081.59737866896

from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                        loc=squared_errors.mean(),
                        scale=stats.sem(squared_errors)))

array([46085.6906593 , 49997.89124977])

参考文献

madao10086+

关注

2
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
动手学机器学习(第二版)-第二章端到端的机器学习项目

第二章端到端的机器学习项目# Python ≥3.5 is requiredimport sysassert sys.version_info >= (3, 5)# Scikit-Learn ≥0.20 is requiredimport sklearnassert sklearn.__version__ >= "0.20"# Common importsimport numpy as npimport os# To plot pretty figures%matp
复制链接

扫一扫