- 《机器学习实战》阅读笔记
读入数据
import pandas as pd
import os
# Default dataset directory. Built with os.path.join so that the path
# separator is right on every platform (the previous raw string hard-coded
# Windows backslashes).
HOUSING_PATH = os.path.join("handson-ml-master", "datasets", "housing")


def load_housing_data(housing_path=HOUSING_PATH):
    """Load the housing dataset into a DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` for that file.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
housing.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude 20640 non-null float64
latitude 20640 non-null float64
housing_median_age 20640 non-null float64
total_rooms 20640 non-null float64
total_bedrooms 20433 non-null float64
population 20640 non-null float64
households 20640 non-null float64
median_income 20640 non-null float64
median_house_value 20640 non-null float64
ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
housing["ocean_proximity"].value_counts()
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: ocean_proximity, dtype: int64
housing.describe()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
---|---|---|---|---|---|---|---|---|---|
count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the DataFrame's columns, 50 bins each, on a 20x15 figure
housing.hist(bins= 50, figsize = (20, 15))
plt.show()
创建测试集
import numpy as np
def split_train_test(data, test_ratio, seed=42):
    """Randomly split *data* into a training set and a test set.

    The shuffle is driven by a private ``RandomState`` so that calling this
    function does not reseed NumPy's global random generator as a side
    effect.  With the default seed the split is the same one obtained from
    ``np.random.seed(42)`` followed by ``np.random.permutation``.

    Args:
        data: A ``pandas.DataFrame`` (must support ``len`` and ``.iloc``).
        test_ratio: Fraction of the rows that goes to the test set.
        seed: Seed of the shuffle; defaults to 42.

    Returns:
        A ``(train_set, test_set)`` tuple of DataFrames.
    """
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    # The first test_set_size shuffled positions form the test set,
    # everything after them the training set.
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set),"train+", len(test_set), "test")
16512 train+ 4128 test
import hashlib
def test_set_check(identifier, test_ratio, hash):
return hash(np.int64(identifier)).digest()[-1]<256 * test_ratio
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split *data* into train and test sets based on a hash of each row's id.

    Every value of ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns True form the test set, all other rows the training
    set.

    Returns:
        A ``(train_set, test_set)`` tuple.
    """
    def _goes_to_test(id_):
        return test_set_check(id_, test_ratio, hash)

    test_mask = data[id_column].apply(_goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part
# Use the row index as the identifier for the hash-based split
housing_with_id = housing.reset_index() # adds an "index" column to the dataset
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
print(len(train_set),"train+", len(test_set), "test")
16362 train+ 4278 test
# Scikit-Learn ships a ready-made dataset-splitting function: train_test_split
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
print(len(train_set),":train+", len(test_set), ":test")
16512 :train+ 4128 :test
# Create the income-category attribute
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].hist(bins= 50)
# Merge every category from 5 upwards into category 5. The result is assigned
# back to the column: calling where(..., inplace=True) on the temporary
# housing["income_cat"] is a chained in-place operation, which pandas does
# not guarantee to propagate to `housing`.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
# Stratified sampling on income_cat with StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits is 1, so the generator yields a single (train, test) pair of index arrays
train_index, test_index = next(split.split(housing, housing["income_cat"]))
start_train_set = housing.loc[train_index]
start_test_set = housing.loc[test_index]
housing["income_cat"].value_counts() / len(housing)
3.0 0.350581
2.0 0.318847
4.0 0.176308
5.0 0.114438
1.0 0.039826
Name: income_cat, dtype: float64
# Income-category proportions inside the training set. Divide by the size of
# the training set itself: dividing by len(housing) made the ratios sum to
# 0.8 instead of 1, so they could not be compared with the full-data ratios.
# NOTE: the output recorded below was produced with the old denominator.
start_train_set["income_cat"].value_counts() / len(start_train_set)
3.0 0.280475
2.0 0.255087
4.0 0.141037
5.0 0.091521
1.0 0.031880
Name: income_cat, dtype: float64
# Drop the income_cat attribute so the data is back to its original columns
for set_ in (start_train_set, start_test_set):  # `set_`: do not shadow the builtin `set`
    set_.drop(["income_cat"], axis = 1, inplace = True)
探索训练数据集
housing = start_train_set.copy()
将地理数据可视化
housing.plot(kind = "scatter", x = "longitude", y="latitude")
<matplotlib.axes._subplots.AxesSubplot at 0x20b283af7f0>
housing.plot(kind = "scatter", x = "longitude", y="latitude", alpha = 0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x20b28209b00>
观测房价信息
# Geographic scatter plot: marker size (s) is population / 100 and marker
# colour (c) is median_house_value, drawn with the "jet" colormap.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha = 0.4,
s = housing["population"]/100, label = "population",
c= "median_house_value", cmap = plt.get_cmap("jet"), colorbar = True)
plt.legend()
<matplotlib.legend.Legend at 0x20b27d55e48>
寻找数据的相关性
# Correlate the numeric columns only: ocean_proximity holds text, and recent
# pandas versions raise on non-numeric columns instead of silently skipping
# them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Correlation of every attribute with the median house value, strongest first
corr_matrix["median_house_value"].sort_values(ascending = False)
median_house_value 1.000000
median_income 0.687160
total_rooms 0.135097
housing_median_age 0.114110
households 0.064506
total_bedrooms 0.047689
population -0.026920
longitude -0.047432
latitude -0.142724
Name: median_house_value, dtype: float64
# pandas' scatter_matrix plots every selected attribute against every other
# one. It lives in pandas.plotting; pandas.tools.plotting is the legacy
# location and is kept only as a fallback for old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms","housing_median_age"]
scatter_matrix(housing[attributes], figsize= (12,8))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B27E4A668>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B27EB7160>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B27E2F0B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B27D24278>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B27CC6A20>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B27CC6A58>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B2817C4A8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B2824E278>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B28B963C8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B28BF20B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B28CC94E0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B28D17EB8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B2A5EB4E0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B2A605A58>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B2A6AA240>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000020B2A712550>]],
dtype=object)
#最相关的是房价中位数和收入中位数
housing.plot(kind= "scatter", x = "median_income", y = "median_house_value", alpha = 0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x20b2bfd0cc0>
试验不同属性的组合
# Derived ratio attributes
housing["room_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_households"] = housing["population"]/housing["households"]
# Look at the correlation matrix again, now including the new attributes.
# Only numeric columns are correlated: ocean_proximity holds text, and recent
# pandas versions raise on non-numeric columns instead of skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending = False)
median_house_value 1.000000
median_income 0.687160
room_per_household 0.146285
total_rooms 0.135097
housing_median_age 0.114110
households 0.064506
total_bedrooms 0.047689
population_per_households -0.021985
population -0.026920
longitude -0.047432
latitude -0.142724
bedrooms_per_room -0.259984
Name: median_house_value, dtype: float64
机器学习算法的数据准备
housing = start_train_set.drop("median_house_value", axis=1)
housing_labels = start_train_set["median_house_value"].copy()
数据清理
处理缺失项
因为total_bedrooms这个属性存在缺失值,所以需要对其进行处理。有三种方法可以解决这个问题:
- 放弃这些相应的区域
- 放弃这个属性
- 将缺失值设置为某个值(0、平均数或者中位数都可以)
通过DataFrame的dropna(),drop()和fillna()方法可以轻松完成这些工作
# housing.dropna(subset=["total_bedrooms"]) # Option 1: drop the affected districts
# housing.drop("total_bedrooms", axis = 1) # Option 2: drop the whole attribute
medien = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(medien) # Option 3: fill with the median (the result is only displayed here, not assigned back)
17606 351.0
18632 108.0
14650 471.0
3230 371.0
3555 1525.0
19480 588.0
8879 317.0
13685 293.0
4937 465.0
4861 229.0
16365 951.0
19684 559.0
19234 501.0
13956 582.0
2390 495.0
11176 649.0
15614 545.0
2953 251.0
13209 409.0
6569 261.0
5825 913.0
18086 538.0
16718 945.0
13600 278.0
13989 444.0
15168 190.0
6747 563.0
7398 366.0
5562 133.0
16121 416.0
...
12380 767.0
5618 24.0
10060 539.0
18067 438.0
4471 797.0
19786 300.0
9969 393.0
14621 1051.0
579 302.0
11682 1615.0
245 460.0
12130 537.0
16441 544.0
11016 428.0
19934 422.0
1364 34.0
1236 829.0
5364 272.0
11703 300.0
10356 449.0
15270 515.0
3754 373.0
12166 756.0
6003 932.0
7364 212.0
6563 236.0
12053 294.0
13908 872.0
11159 380.0
15775 682.0
Name: total_bedrooms, dtype: float64
# Scikit-Learn handles missing values with an imputer.
# sklearn.preprocessing.Imputer was replaced by sklearn.impute.SimpleImputer
# and later removed; the old class is kept only as a fallback for old
# installations.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer
imputer = SimpleImputer(strategy = "median")
# Work on a copy of the data without the text attribute
housing_num = housing.drop("ocean_proximity", axis = 1)
imputer.fit(housing_num)
Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)
imputer.statistics_
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. ,
408. , 3.5409])
housing_num.median().values
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. ,
408. , 3.5409])
X = imputer.transform(housing_num)# 包含转换后特征的numpy数组
X
array([[-121.89 , 37.29 , 38. , ..., 710. , 339. ,
2.7042],
[-121.93 , 37.05 , 14. , ..., 306. , 113. ,
6.4214],
[-117.2 , 32.77 , 31. , ..., 936. , 462. ,
2.8621],
...,
[-116.4 , 34.09 , 9. , ..., 2098. , 765. ,
3.2723],
[-118.01 , 33.82 , 31. , ..., 1356. , 356. ,
4.0625],
[-122.45 , 37.77 , 52. , ..., 1269. , 639. ,
3.575 ]])
处理文本和分类属性
# Encode the ocean_proximity text attribute as category codes with
# Scikit-Learn's LabelEncoder transformer
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
array([0, 0, 4, ..., 1, 0, 3], dtype=int64)
print(encoder.classes_)
['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
# Scikit-Learn's OneHotEncoder converts the integer categories into one-hot vectors.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# reshape(-1,1) turns the 1-D code array into a single-column 2-D array
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
with 16512 stored elements in Compressed Sparse Row format>
# 这里输出的是一个Scipy的稀疏矩阵,转换成numpy数组的话,调用toarray()函数即可
housing_cat_1hot.toarray()
array([[1., 0., 0., 0., 0.],
[1., 0., 0., 0., 0.],
[0., 0., 0., 0., 1.],
...,
[0., 1., 0., 0., 0.],
[1., 0., 0., 0., 0.],
[0., 0., 0., 1., 0.]])
# LabelBinarizer performs both conversions (text -> codes -> one-hot) in one step.
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_2hot = encoder.fit_transform(housing_cat)
housing_cat_2hot
array([[1, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[0, 0, 0, 0, 1],
...,
[0, 1, 0, 0, 0],
[1, 0, 0, 0, 0],
[0, 0, 0, 1, 0]])
encoder = LabelBinarizer(sparse_output = True)
housing_cat_2hot = encoder.fit_transform(housing_cat)
housing_cat_2hot
<16512x5 sparse matrix of type '<class 'numpy.int32'>'
with 16512 stored elements in Compressed Sparse Row format>
自定义转换器
方法:创建一个类,然后实现三个方法:
- fit() (返回自身)
- transform()
- fit_transform()
如果添加基类TransformerMixin,就可以直接得到最后一个方法。
添加基类BaseEstimator(并且在构造函数中不使用*args和**kwargs),可以额外获得两个非常有用的、用于读取和设置超参数的方法:get_params和set_params。
构建一个简单的转换器类,用来添加组合后的属性。
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions used to index X inside CombinedAttributesAdder.transform
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns computed from existing ones.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with the extra ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_[(a, b, ...)] is the same indexing call as np.c_[a, b, ...]
        return np.c_[tuple([X] + extra_columns)]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)
特征缩放
同比例缩放所有属性,常见的两种方法,是:最大-最小缩放和标准化
- 最小-最大缩放就是归一化。对此Scikit-Learn提供了一个名为MinMaxScaler的转换器,如果不希望范围是0~1,可以通过超参数feature_range来调整
- 标准化,就是数据减去平均值,然后除以标准差(注意不是方差),Sklearn提供的标准化转换器是StandardScaler.
转换流水线
许多数据转换步骤需要按照正确的顺序执行,Sklearn提供了Pipeline来支持这样的操作。
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing chain: median imputation -> combined attributes -> standard scaling
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy = "median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
# Fit the whole chain on the numeric columns and transform them in one call
housing_num_tr = num_pipeline.fit_transform(housing_num)
# Scikit-Learn does not work on DataFrames directly here, so this transformer
# selects the wanted columns and hands them on as a NumPy array.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select columns of a DataFrame and return them as a NumPy array.

    The constructor argument is stored under its own name
    (``attribute_name``): ``BaseEstimator.get_params`` and ``clone`` look up
    an attribute named after each ``__init__`` parameter, which the previous
    ``self.attribute_names`` did not provide.  ``attribute_names`` remains
    available as a read/write alias.

    Args:
        attribute_name: Column label, or list of column labels, to select.
    """

    def __init__(self, attribute_name):
        self.attribute_name = attribute_name

    @property
    def attribute_names(self):
        """Backward-compatible alias for ``attribute_name``."""
        return self.attribute_name

    @attribute_names.setter
    def attribute_names(self, value):
        self.attribute_name = value

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_name].values``."""
        return X[self.attribute_name].values
from sklearn.pipeline import FeatureUnion
# Column names handled by the numeric and the categorical branch respectively
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
# Numeric branch: select columns -> impute medians -> add combined attributes -> scale
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', SimpleImputer(strategy = "median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
# Categorical branch: select the text column -> binarize it
# NOTE(review): LabelBinarizer is designed for labels; newer scikit-learn
# releases may reject it as a Pipeline step — confirm against the installed version.
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('label_binarizer', LabelBinarizer()),
])
# Run both branches and concatenate their outputs
full_pipeline = FeatureUnion(transformer_list = [
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281, 0.77194962, 0.74333089, ..., 0. ,
0. , 0. ],
[-1.17602483, 0.6596948 , -1.1653172 , ..., 0. ,
0. , 0. ],
[ 1.18684903, -1.34218285, 0.18664186, ..., 0. ,
0. , 1. ],
...,
[ 1.58648943, -0.72478134, -1.56295222, ..., 0. ,
0. , 0. ],
[ 0.78221312, -0.85106801, 0.18664186, ..., 0. ,
0. , 0. ],
[-1.43579109, 0.99645926, 1.85670895, ..., 0. ,
1. , 0. ]])
housing_prepared.shape
(16512, 16)
选择和训练模型
# 先建立一个线性回归模型
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Prediction:\t", lin_reg.predict(some_data_prepared))
Prediction: [210644.60459286 317768.80697211 210956.43331178 59218.98886849
189747.55849879]
print("Label:\t\t", list(some_labels))
Label: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
68628.19819848923
# 建立一个更复杂的模型,决策树。
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
max_leaf_nodes=None, min_impurity_split=1e-07,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
# 用训练集评估这个决策树
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
0.0
使用交叉验证来更好地进行评估
- 一种方法是利用train_test_split函数将训练集分为训练集和验证集
- 利用Sklearn进行K折交叉验证
# K-fold cross-validation (cv=10) of the decision tree
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring= "neg_mean_squared_error",cv=10)
# The scoring is the negated MSE, so flip the sign before taking the square root
tree_rmse_scores= np.sqrt(-scores)
def display_scores(scores):
    """Print the scores, then their mean and their standard deviation."""
    print("Scores:", scores)
    # Each statistic is computed right before it is printed.
    for label, statistic in (("Mean:", "mean"), ("Standard deviaion:", "std")):
        print(label, getattr(scores, statistic)())
display_scores(tree_rmse_scores)
Scores: [67676.03572833 67381.92244006 70214.55423432 68305.20744291
71301.73405342 75578.05155017 72569.67860599 69993.33598143
77010.70393664 69579.43762579]
Mean: 70961.06615990633
Standard deviaion: 3076.951727392316
# 计算一下线性回归模型的分数
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring= "neg_mean_squared_error", cv =10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
Scores: [66782.73843989 66960.118071 70347.95244419 74739.57052552
68031.13388938 71193.84183426 64969.63056405 68281.61137997
71552.91566558 67665.10082067]
Mean: 69052.46136345083
Standard deviaion: 2731.6740017983466
# 试一下RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv =10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
C:\Users\14464\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
from numpy.core.umath_tests import inner1d
Scores: [51456.36970341 49655.64685851 53348.43463562 54482.08525421
51952.74356041 54631.6502789 50652.57926391 50049.76730169
55728.91308856 52564.10718827]
Mean: 52452.22971334962
Standard deviaion: 1962.0195695190785
微调模型
- 网格搜索,利用GridSearchCV来进行搜索
from sklearn.model_selection import GridSearchCV
param_grid = [
# try 12 (3×4) combinations of hyperparameters
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
# then try 6 (2×3) combinations with bootstrap set as False
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
GridSearchCV(cv=5, error_score='raise',
estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
verbose=0, warm_start=False),
fit_params={}, iid=True, n_jobs=1,
param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='neg_mean_squared_error', verbose=0)
grid_search.best_params_
{'max_features': 8, 'n_estimators': 30}
grid_search.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features=8, max_leaf_nodes=None, min_impurity_split=1e-07,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
oob_score=False, random_state=42, verbose=0, warm_start=False)
# Print the cross-validated RMSE of every hyperparameter combination tried
cvres = grid_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-neg_mse)  # mean_test_score holds the negated MSE
    print(rmse, candidate)
63825.04793017674 {'max_features': 2, 'n_estimators': 3}
55643.842909084706 {'max_features': 2, 'n_estimators': 10}
53380.65668593633 {'max_features': 2, 'n_estimators': 30}
60959.138858487866 {'max_features': 4, 'n_estimators': 3}
52740.58416665252 {'max_features': 4, 'n_estimators': 10}
50374.14214614731 {'max_features': 4, 'n_estimators': 30}
58661.2866461823 {'max_features': 6, 'n_estimators': 3}
52009.973979776936 {'max_features': 6, 'n_estimators': 10}
50154.11777368494 {'max_features': 6, 'n_estimators': 30}
57865.36168014446 {'max_features': 8, 'n_estimators': 3}
51730.07550866553 {'max_features': 8, 'n_estimators': 10}
49694.85143334442 {'max_features': 8, 'n_estimators': 30}
62874.407393096284 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54561.93981572834 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59416.64631449735 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52660.24591103273 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
57490.01682787995 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51093.90594280526 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
pd.DataFrame(grid_search.cv_results_).head(10)
mean_fit_time | mean_score_time | mean_test_score | mean_train_score | param_bootstrap | param_max_features | param_n_estimators | params | rank_test_score | split0_test_score | ... | split2_test_score | split2_train_score | split3_test_score | split3_train_score | split4_test_score | split4_train_score | std_fit_time | std_score_time | std_test_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.056597 | 0.002652 | -4.073637e+09 | -1.107354e+09 | NaN | 2 | 3 | {'max_features': 2, 'n_estimators': 3} | 18 | -3.963584e+09 | ... | -4.194135e+09 | -1.116843e+09 | -3.906732e+09 | -1.112813e+09 | -4.169669e+09 | -1.129842e+09 | 0.005951 | 2.101120e-04 | 1.160694e+08 | 1.927217e+07 |
1 | 0.165095 | 0.007350 | -3.096237e+09 | -5.813707e+08 | NaN | 2 | 10 | {'max_features': 2, 'n_estimators': 10} | 11 | -3.070368e+09 | ... | -3.124982e+09 | -5.780873e+08 | -2.865117e+09 | -5.713421e+08 | -3.169914e+09 | -5.797944e+08 | 0.003754 | 2.034255e-04 | 1.297819e+08 | 6.782553e+06 |
2 | 0.484294 | 0.020442 | -2.849495e+09 | -4.394633e+08 | NaN | 2 | 30 | {'max_features': 2, 'n_estimators': 30} | 9 | -2.697829e+09 | ... | -2.943808e+09 | -4.374429e+08 | -2.619893e+09 | -4.374715e+08 | -2.968460e+09 | -4.451903e+08 | 0.004509 | 3.696311e-04 | 1.593649e+08 | 2.961109e+06 |
3 | 0.083129 | 0.002480 | -3.716017e+09 | -9.850011e+08 | NaN | 4 | 3 | {'max_features': 4, 'n_estimators': 3} | 16 | -3.729600e+09 | ... | -3.736527e+09 | -9.172986e+08 | -3.404974e+09 | -1.035901e+09 | -3.914186e+09 | -9.711998e+08 | 0.000595 | 1.168008e-07 | 1.690029e+08 | 4.047487e+07 |
4 | 0.261814 | 0.007144 | -2.781569e+09 | -5.160154e+08 | NaN | 4 | 10 | {'max_features': 4, 'n_estimators': 10} | 8 | -2.667093e+09 | ... | -2.891599e+09 | -4.960301e+08 | -2.613393e+09 | -5.422542e+08 | -2.949550e+09 | -5.158794e+08 | 0.001338 | 2.423495e-04 | 1.278498e+08 | 1.498960e+07 |
5 | 0.793739 | 0.020817 | -2.537554e+09 | -3.878685e+08 | NaN | 4 | 30 | {'max_features': 4, 'n_estimators': 30} | 3 | -2.387199e+09 | ... | -2.663178e+09 | -3.789712e+08 | -2.397951e+09 | -4.036920e+08 | -2.649850e+09 | -3.846171e+08 | 0.022585 | 3.372798e-04 | 1.209935e+08 | 8.424973e+06 |
6 | 0.112208 | 0.002579 | -3.441147e+09 | -9.030212e+08 | NaN | 6 | 3 | {'max_features': 6, 'n_estimators': 3} | 14 | -3.119576e+09 | ... | -3.587747e+09 | -9.360639e+08 | -3.331544e+09 | -9.025026e+08 | -3.577062e+09 | -8.612945e+08 | 0.001741 | 1.986748e-04 | 1.884229e+08 | 2.639683e+07 |
7 | 0.372794 | 0.010541 | -2.705037e+09 | -5.014210e+08 | NaN | 6 | 10 | {'max_features': 6, 'n_estimators': 10} | 6 | -2.553481e+09 | ... | -2.762945e+09 | -4.996537e+08 | -2.519522e+09 | -4.989516e+08 | -2.906270e+09 | -5.063617e+08 | 0.003179 | 5.153954e-03 | 1.464963e+08 | 3.357661e+06 |
8 | 1.081889 | 0.022045 | -2.515436e+09 | -3.840197e+08 | NaN | 6 | 30 | {'max_features': 6, 'n_estimators': 30} | 2 | -2.371924e+09 | ... | -2.607962e+09 | -3.805596e+08 | -2.351220e+09 | -3.856159e+08 | -2.662399e+09 | -3.904866e+08 | 0.012326 | 2.417753e-03 | 1.283580e+08 | 3.796810e+06 |
9 | 0.139117 | 0.002480 | -3.348400e+09 | -8.884890e+08 | NaN | 8 | 3 | {'max_features': 8, 'n_estimators': 3} | 13 | -3.351347e+09 | ... | -3.396841e+09 | -8.596460e+08 | -3.131753e+09 | -8.893698e+08 | -3.509451e+09 | -9.146734e+08 | 0.001478 | 3.504023e-07 | 1.226683e+08 | 2.730057e+07 |
10 rows × 23 columns
- 随机搜索:当超参数的搜索范围较大时,通常会优先使用RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
'n_estimators': randint(low=1, high= 200),
'max_features': randint(low=1, high=8),
}
forest_reg = RandomForestRegressor(random_state = 42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions = param_distribs,
n_iter = 10, cv=5, scoring= 'neg_mean_squared_error', random_state= 42)
rnd_search.fit(housing_prepared, housing_labels)
RandomizedSearchCV(cv=5, error_score='raise',
estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
verbose=0, warm_start=False),
fit_params={}, iid=True, n_iter=10, n_jobs=1,
param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B2D1104E0>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B2D6D3C18>},
pre_dispatch='2*n_jobs', random_state=42, refit=True,
return_train_score=True, scoring='neg_mean_squared_error',
verbose=0)
# Print the cross-validated RMSE of every sampled hyperparameter combination
cvres = rnd_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-neg_mse)  # mean_test_score holds the negated MSE
    print(rmse, candidate)
49147.15241724505 {'max_features': 7, 'n_estimators': 180}
51396.876896929905 {'max_features': 5, 'n_estimators': 15}
50798.30254230401 {'max_features': 3, 'n_estimators': 72}
50840.744513982805 {'max_features': 5, 'n_estimators': 21}
49276.17530332962 {'max_features': 7, 'n_estimators': 122}
50776.736049370644 {'max_features': 3, 'n_estimators': 75}
50682.707554646404 {'max_features': 3, 'n_estimators': 88}
49612.152530468346 {'max_features': 5, 'n_estimators': 100}
50472.610733616304 {'max_features': 3, 'n_estimators': 150}
64458.25385034794 {'max_features': 5, 'n_estimators': 2}
分析最佳模型及其错误
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
array([7.33442355e-02, 6.29090705e-02, 4.11437985e-02, 1.46726854e-02,
1.41064835e-02, 1.48742809e-02, 1.42575993e-02, 3.66158981e-01,
5.64191792e-02, 1.08792957e-01, 5.33510773e-02, 1.03114883e-02,
1.64780994e-01, 6.02803867e-05, 1.96041560e-03, 2.85647464e-03])
# Show each importance score next to the name of its attribute
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# NOTE(review): uses the module-level `encoder`, not the binarizer inside
# full_pipeline — presumably both saw the same categories; confirm.
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances,attributes), reverse = True)
[(0.3661589806181342, 'median_income'),
(0.1647809935615905, 'INLAND'),
(0.10879295677551573, 'pop_per_hhold'),
(0.07334423551601242, 'longitude'),
(0.0629090704826203, 'latitude'),
(0.05641917918195401, 'rooms_per_hhold'),
(0.05335107734767581, 'bedrooms_per_room'),
(0.041143798478729635, 'housing_median_age'),
(0.014874280890402767, 'population'),
(0.014672685420543237, 'total_rooms'),
(0.014257599323407807, 'households'),
(0.014106483453584102, 'total_bedrooms'),
(0.010311488326303787, '<1H OCEAN'),
(0.002856474637320158, 'NEAR OCEAN'),
(0.00196041559947807, 'NEAR BAY'),
(6.028038672736599e-05, 'ISLAND')]
通过测试集评估系统
# Evaluate the best model found by the grid search on the held-out test set
final_model = grid_search.best_estimator_
X_test = start_test_set.drop("median_house_value", axis =1)
y_test = start_test_set["median_house_value"].copy()
# transform() only: the pipeline is applied as already fitted on the training data
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
# Keep the RMSE under its own name (as with lin_rmse / tree_rmse) instead of
# overwriting final_mse with a value that is no longer an MSE.
final_rmse = np.sqrt(final_mse)
final_rmse
47766.00396643308
y_test.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x20b2d0c4240>
pd.Series(final_predictions).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x20b2d590c50>