动手学机器学习(第二版)-第二章端到端的机器学习项目

第二章 端到端的机器学习项目

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: as plain strings "0.3" >= "0.20" is True,
# so a lexicographic check would accept releases older than 0.20.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version and tuple(map(int, _sklearn_version.groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>``. When *tight_layout* is
    true, ``plt.tight_layout()`` is applied before saving. *resolution* is
    passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

下载数据

import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/madao33/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Extract ``housing.tgz`` into *housing_path*, downloading it only if absent.

    Args:
        housing_url: URL of the archive; used only when no local copy exists.
        housing_path: Directory that holds ``housing.tgz`` and receives the
            extracted files. It is created if it does not exist.

    Raises:
        urllib.error.URLError: if the archive is missing locally and the
            download fails.
        tarfile.TarError: if the archive cannot be read.
    """
    # A bare `import urllib` does not load the `request` submodule.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # The download used to fail with URLError in the author's environment, so a
    # local copy takes precedence and the network is touched only as a fallback.
    if not os.path.isfile(tgz_path):
        urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in *housing_path* and return it as a DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    frame = pd.read_csv(housing_csv)
    return frame

快速查看数据结构

housing = load_housing_data()
housing.head()
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valueocean_proximity
0-122.2337.8841.0880.0129.0322.0126.08.3252452600.0NEAR BAY
1-122.2237.8621.07099.01106.02401.01138.08.3014358500.0NEAR BAY
2-122.2437.8552.01467.0190.0496.0177.07.2574352100.0NEAR BAY
3-122.2537.8552.01274.0235.0558.0219.05.6431341300.0NEAR BAY
4-122.2537.8552.01627.0280.0565.0259.03.8462342200.0NEAR BAY
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
housing["ocean_proximity"].value_counts()
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
housing.describe()
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
count20640.00000020640.00000020640.00000020640.00000020433.00000020640.00000020640.00000020640.00000020640.000000
mean-119.56970435.63186128.6394862635.763081537.8705531425.476744499.5396803.870671206855.816909
std2.0035322.13595212.5855582181.615252421.3850701132.462122382.3297531.899822115395.615874
min-124.35000032.5400001.0000002.0000001.0000003.0000001.0000000.49990014999.000000
25%-121.80000033.93000018.0000001447.750000296.000000787.000000280.0000002.563400119600.000000
50%-118.49000034.26000029.0000002127.000000435.0000001166.000000409.0000003.534800179700.000000
75%-118.01000037.71000037.0000003148.000000647.0000001725.000000605.0000004.743250264725.000000
max-114.31000041.95000052.00000039320.0000006445.00000035682.0000006082.00000015.000100500001.000000
housing.hist(bins=100, figsize=(20, 15))
plt.show()

![chap2_11_0](chap2_files/chap2_11_0.png)

创建测试数据集

def split_train_test(data, test_ratio):
    """Randomly split *data* into a training part and a test part.

    A random permutation of the row positions is drawn with
    ``np.random.permutation``; the first ``int(len(data) * test_ratio)``
    positions form the test set and the remainder the training set.

    Returns:
        A ``(train, test)`` tuple selected from *data* with ``iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    held_out, kept = order[:n_test], order[n_test:]
    return data.iloc[kept], data.iloc[held_out]
train_set, test_set = split_train_test(housing, 0.2)
len(train_set)
16512
len(test_set)
4128
from zlib import crc32
def test_set_check(indentfier, test_ratio):
    return crc32(np.int64(indentfier)) & 0xffffffff < test_ratio * 2**32
def split_train_test_by_id(data, test_ratio, id_column):
    """Split *data* into ``(train, test)`` based on the values in *id_column*.

    Each identifier is passed to ``test_set_check``; rows for which it
    returns True go to the test set, all other rows to the training set.
    """
    mask = data[id_column].apply(lambda row_id: test_set_check(row_id, test_ratio))
    test_part = data.loc[mask]
    train_part = data.loc[~mask]
    return train_part, test_part
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
housing_with_id["id"] = housing["longitude"] *1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state = 42)
housing["income_cat"] = pd.cut(housing["median_income"], bins = [0., 1.5, 3.0, 4.5, 6., np.inf], labels = [1, 2, 3, 4, 5])
housing["income_cat"].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f60c67ddc88>

![chap2_23_1](chap2_files/chap2_23_1.png)

from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified shuffle split (n_splits=1): 20% test, stratified on income_cat,
# with a fixed random_state so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # NOTE(review): split() yields positional indices; .loc selects the same rows
    # only because housing still has the default RangeIndex from read_csv.
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
strat_test_set["income_cat"].value_counts() / len(strat_test_set)
3    0.350533
2    0.318798
4    0.176357
5    0.114583
1    0.039729
Name: income_cat, dtype: float64
# Remove the helper income_cat column from both stratified splits (in place)
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace = True)
strat_test_set.head()
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valueocean_proximity
5241-118.3934.1229.06447.01012.02184.0960.08.2816500001.0<1H OCEAN
10970-117.8633.7739.04159.0655.01669.0651.04.6111240300.0<1H OCEAN
20351-119.0534.2127.04357.0926.02110.0876.03.0119218200.0<1H OCEAN
6568-118.1534.2052.01786.0306.01018.0322.04.1518182100.0INLAND
13285-117.6834.0732.01775.0314.01067.0302.04.0375121300.0INLAND

可视化和探索数据

housing = strat_train_set.copy()
housing.plot(kind="scatter", x = "longitude", y ="latitude")
<matplotlib.axes._subplots.AxesSubplot at 0x7f60c6777a90>

![chap2_30_1](chap2_files/chap2_30_1.png)

housing.plot(kind="scatter", x = "longitude", y ="latitude", alpha = 0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f60c7b481d0>

![chap2_31_1](chap2_files/chap2_31_1.png)

housing.plot(kind="scatter", x="longitude",y="latitude", alpha=0.4, s=housing["population"]/100, label="population", figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()
<matplotlib.legend.Legend at 0x7f60c7bfc668>

![chap2_32_1](chap2_files/chap2_32_1.png)

查看关联

# Correlate numeric columns only: housing still contains the string column
# ocean_proximity, and DataFrame.corr() raises on it in pandas >= 2.0
# (older versions silently skipped it, so the result is unchanged there).
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7ce79e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7cf2748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c8471b00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7f212e8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7fbc128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7eadc50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7e78518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7d67438>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7d67588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c6d0eeb8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c80cc470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7fe43c8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f60c6db4208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c7fc8b70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c83e24a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f60c8315240>]],
      dtype=object)

![chap2_36_1](chap2_files/chap2_36_1.png)

housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f60c85bf470>

![chap2_37_1](chap2_files/chap2_37_1.png)

尝试属性结合

# Derived ratio features that may be more informative than the raw totals.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]
# Correlate numeric columns only: the string column ocean_proximity makes
# DataFrame.corr() raise in pandas >= 2.0 (older versions skipped it silently).
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

为机器学习算法准备数据

housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

数据清理

大多数的机器学习算法无法在缺少特征的情况下正常运行,因此需要采用一些特殊的方法处理数据,一般有以下三种选择:

  • 去除相关的区域
  • 去除缺少部分数据的整个属性
  • 将缺失的部分用某个值代替(0, 均值,中值等)

这里采用值替代的方法

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_
housing_num.median().values
array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])
# transform() returns a plain NumPy array, so rebuild the DataFrame by hand.
X = imputer.transform(housing_num)
# Reuse the original row labels: without index= the frame gets a fresh
# 0..n-1 index that no longer lines up with housing / housing_labels.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

处理文本和分类属性

housing_cat = housing["ocean_proximity"]
housing_cat.head(10)
17606     <1H OCEAN
18632     <1H OCEAN
14650    NEAR OCEAN
3230         INLAND
3555      <1H OCEAN
19480        INLAND
8879      <1H OCEAN
13685        INLAND
4937      <1H OCEAN
4861      <1H OCEAN
Name: ocean_proximity, dtype: object

大多数机器学习算法处理数值会更加容易,因此在这里我们将分类转换为数值,使用Scikit-Learn的OrdinalEncoder

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
# Newer scikit-learn versions require a 2-D array as input, otherwise an error is raised
housing_cat = np.array(housing_cat).reshape(-1,1)
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])
ordinal_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]
# 独热编码(one-hot encoding),二进制位一位为1表示某个属性
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>
housing_cat_1hot.toarray()
array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])
cat_encoder.categories_
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

当分类属性的类别非常多时(例如地区代码、物种等),独热编码会产生大量的输入特征,这会降低训练和迭代的速度。解决的办法是将相似的类别合并,或者用其他的特征替代

自定义转换器(Custom Transformers)

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinesArrtibutesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    The columns are located through the module-level positions ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix``. Rooms per
    household and population per household are always appended; bedrooms
    per room is appended last when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return *X* with the ratio columns concatenated on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Same call as np.c_[X, col1, col2, ...], with the operands built as a tuple.
        return np.c_[(X, *extra_columns)]
        

attr_adder = CombinesArrtibutesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

特征缩放(feature scaling)

  • 平均值规范化(min-max scaling, also called normalization):数值范围缩放到0-1

$$x=\frac{x-\mathrm{mean}(x)}{\max(x)-\min(x)}$$

  • 标准化(Standardization): 先减去均值再除以标准差,结果的均值为0、方差为1;但数值没有缩放到特定的范围(例如0-1),在某些算法中可能引发问题,例如神经网络常常期望输入数据的范围在0-1之间

$$x=\frac{x-\mathrm{mean}(x)}{\mathrm{std}(x)}$$

$$\mathrm{std}(x)=\sqrt{\frac{\sum(x-\mathrm{mean}(x))^2}{n}}$$

转换数据流(Transformation Pipelines)

有许多的数据转换需要按照正确的顺序执行。Scikit-Learn提供了Pipeline类来帮助实现这种转换序列

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing, applied in order: fill missing values with the column
# median, append the combined ratio attributes, then standardize every column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinesArrtibutesAdder()),
    ('std_scaler', StandardScaler()),
])

# Fit the pipeline on the numeric columns and transform them in one step.
housing_num_tr = num_pipeline.fit_transform(housing_num)
from sklearn.compose import ColumnTransformer

# Column names handled by each branch: every numeric column vs. the single
# categorical column.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Route numeric columns through num_pipeline and one-hot encode the categorical one.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit on the full training frame (labels already removed) and transform it.
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

选择和训练模型

终于到这一步了!你已经明确了问题,获取并仔细地查看了数据,将数据集分为训练数据集和测试数据集,然后编写了转换管道(transformation pipelines)来清理数据并自动地为机器学习算法准备数据。你现在已经准备好选择和训练机器学习模型了。

训练及评估训练数据集

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
LinearRegression()
# Try the trained linear model on the first five training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
# Only transform here: the pipeline was already fitted on the training set.
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
Predictions: [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
print("Labels", list(some_labels))
Labels [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
# Measure the linear regression model's RMSE on the whole training set
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
# RMSE is the square root of the mean squared error.
lin_rmse = np.sqrt(lin_mse)
lin_rmse
68628.19819848922

很明显,模型的训练数据集得到的RMSE较大,毕竟大多数街区的房价中位数(median_housing_values)的范围在$120,000到$265,000之内。这是模型欠拟合数据的典型例子,可能是数据特征不能提供做出准确预测的信息,亦或是模型本身不够强大。解决的方法有:

  • 选择更加强大(复杂)的模型
  • 给训练算法提供更多的特征
  • 减少模型的限制

这个模型没有正则化,所以我们排除了最后一个选项(减少模型限制),首先尝试一个复杂度更高的模型,看看结果如何。这里选择的是DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
DecisionTreeRegressor()
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
0.0

这里发现训练误差完全为0,可以判断是模型过拟合

使用交叉校验更好地评估

一种方法是使用train_test_split将训练数据集分为更小的训练集和验证集;这里采用更方便的K折交叉验证(cross_val_score, cv=10):将训练集随机分成10个子集(fold),每次用其中9个训练、剩下的1个评估,共训练和评估10次

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print *scores*, then their mean and standard deviation, one per line."""
    print("Scores:", scores)
    for label, stat in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, stat)())
display_scores(tree_rmse_scores)
Scores: [70107.18598369 67427.66761479 70494.7133846  68052.83797215
 69684.83810574 74858.44630746 70286.7560609  70972.86657154
 75833.77220411 70506.52871244]
Mean: 70822.56129174255
Standard deviation: 2508.395675655727
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
Scores: [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
Mean: 69052.46136345083
Standard deviation: 2731.674001798348

对比两种模型,发现决策树模型严重过拟合,预测结果比线性回归模型更差

现在尝试最后一种模型,随机森林RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
RandomForestRegressor()
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", forest_reg.predict(some_data_prepared))
Predictions: [272894.   339038.03 221886.    51069.   235920.  ]
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
18826.653283467
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error",cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
Scores: [49568.0164352  47558.38150037 50101.59486549 51937.94163799
 49526.1024044  53480.52936402 49095.33120346 47681.52808254
 52794.074608   50179.56480025]
Mean: 50192.30649017154
Standard deviation: 1895.1445809297988
import joblib

joblib.dump(forest_reg, "my_model.pkl")
['my_model.pkl']

微调模型

网格搜索(Grid Search)

  • 一种方法是手动查找超参数(hyperparameters),比较繁琐,也没有足够的时间去探索大量的组合
  • 使用Scikit-Learn's GridSearchCV自动查找
from sklearn.model_selection import GridSearchCV

# Two grids are searched: 3 x 4 = 12 combinations with the default bootstrap
# setting, then 2 x 3 = 6 combinations with bootstrap disabled (18 in total).
param_grid = [
    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},
    {'bootstrap':[False], 'n_estimators':[3, 10], 'max_features':[2, 3, 4]},
]

forest_reg = RandomForestRegressor()

# 5-fold cross-validation per combination, scored by negative MSE;
# training scores are kept as well (return_train_score=True).
grid_search = GridSearchCV(forest_reg, param_grid,
             cv=5,scoring='neg_mean_squared_error',              return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')
grid_search.best_params_
{'max_features': 8, 'n_estimators': 30}
grid_search.best_estimator_
RandomForestRegressor(max_features=8, n_estimators=30)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
63894.67087631534 {'max_features': 2, 'n_estimators': 3}
55667.41733417563 {'max_features': 2, 'n_estimators': 10}
52953.67120428983 {'max_features': 2, 'n_estimators': 30}
60759.33678765079 {'max_features': 4, 'n_estimators': 3}
52610.21756065792 {'max_features': 4, 'n_estimators': 10}
50825.02549091298 {'max_features': 4, 'n_estimators': 30}
59609.16940847891 {'max_features': 6, 'n_estimators': 3}
52329.80288580481 {'max_features': 6, 'n_estimators': 10}
50170.784779709335 {'max_features': 6, 'n_estimators': 30}
59002.639747336456 {'max_features': 8, 'n_estimators': 3}
51925.125329680406 {'max_features': 8, 'n_estimators': 10}
50163.82162133027 {'max_features': 8, 'n_estimators': 30}
61646.782726933416 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54460.336888076854 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60425.28010672884 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52606.60637645159 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
59026.038679228404 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51279.348751331956 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

随机搜索

当需要探索的超参数组合较少时可以采用网格搜索的方法,但是在超参数搜索空间较大时,更好的选择是使用RandomizedSearchCV

组合方法(Ensemble Methods)

另一种微调系统的方法就是尝试将表现最好的模型组合起来,组合常常比单个模型表现的更好

分析最好的模型和误差

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
array([6.85740666e-02, 6.17182312e-02, 4.27956600e-02, 1.55182236e-02,
       1.49635160e-02, 1.44385034e-02, 1.45608964e-02, 3.71913941e-01,
       5.61301486e-02, 1.14093585e-01, 4.72391450e-02, 3.23995057e-03,
       1.70264023e-01, 1.08383941e-04, 1.77172464e-03, 2.67000179e-03])
extras_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extras_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
[(0.3719139405152314, 'median_income'),
 (0.170264022826082, 'INLAND'),
 (0.11409358481656678, 'pop_per_hhold'),
 (0.06857406663435783, 'longitude'),
 (0.06171823122997493, 'latitude'),
 (0.0561301485724391, 'rooms_per_hhold'),
 (0.04723914504113055, 'bedrooms_per_room'),
 (0.04279566004600196, 'housing_median_age'),
 (0.015518223555873671, 'total_rooms'),
 (0.01496351603193718, 'total_bedrooms'),
 (0.01456089643377324, 'households'),
 (0.014438503352036969, 'population'),
 (0.003239950572947161, '<1H OCEAN'),
 (0.0026700017864008615, 'NEAR OCEAN'),
 (0.0017717246440854596, 'NEAR BAY'),
 (0.00010838394116097372, 'ISLAND')]

在测试数据集上评价你的系统

# Evaluate the best estimator found by the grid search on the held-out test set.
final_model = grid_search.best_estimator_

# Separate predictors from labels in the stratified test split.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only, not fit_transform(): the pipeline was fitted on the training data.
x_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(x_test_prepared)

# RMSE on the test set.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
48081.59737866896
from scipy import stats

# 95% confidence interval for the test RMSE: build a t-interval around the mean
# squared error (scale = standard error of the squared errors, n - 1 degrees of
# freedom), then take the square root of both bounds.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                        loc=squared_errors.mean(),
                        scale=stats.sem(squared_errors)))
array([46085.6906593 , 49997.89124977])

参考文献

  1. 《Hands-on Machine Learning with Scikit-Learn, Keras & TensorFlow》(2nd Edition)
  2. github 代码地址
  • 2
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值