机器学习实战笔记【2-(6,7)】-CSDN博客

本文链接：https://blog.csdn.net/qq_32939413/article/details/117260460

Gitee:https://gitee.com/Shine_Zhang/machine_learning_practices

参考：

https://zhuanlan.zhihu.com/p/88797654
《机器学习实战：基于Scikit-Learn、Keras和TensorFlow第2版》

import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32

HOUSING_PATH = 'datasets/housing'


def load_housing_data(housing_path=HOUSING_PATH):
    """Load the housing dataset as a pandas DataFrame.

    Reads the file named ``housing.csv`` located directly inside
    *housing_path* (``HOUSING_PATH`` by default) with ``pandas.read_csv``
    and returns the resulting DataFrame.
    """
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))


# Load the raw housing table from disk.
housing = load_housing_data(HOUSING_PATH)
# Stratification key: bucket the median income into five categories so the
# train/test split can preserve the income distribution.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
# housing["income_cat"].hist()
# plt.show()
# Stratified sampling
from sklearn.model_selection import StratifiedShuffleSplit

# n_splits is the number of train/test pairs to generate; set it as needed
# (the library default is 10). A single pair is enough here.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                               random_state=42)
strat_test_set, strat_train_set = None, None

# split() yields *positional* row indices for the train and test sets, so the
# rows are selected with .iloc. Label-based .loc only gives the same rows
# while the frame still carries its default 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
# strat_train_set.info()

# Drop the helper "income_cat" column so both sets have the original columns.
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)


# Work on the training set only: separate the predictors from the labels
# ("median_house_value" is the target column).
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()


# 自定义Transformation
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions in the array passed to CombinedAttributesAdder.transform
# from which the derived ratio features are computed.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features as extra columns of ``X``.

    Always appended, in this order: rooms per household and population per
    household. When ``add_bedrooms_per_room`` is true, bedrooms per room is
    appended as a third extra column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room
        # Indexing np.c_ with a tuple is the same as np.c_[X, col1, col2, ...].
        return np.c_[(X, *extra_columns)]
# attr_adder = CombinedAttributesAdder()
# housing_extra_attribs = attr_adder.transform(housing.values)

# Keep only the numeric columns: drop the non-numeric "ocean_proximity" column.
housing_num = housing.drop("ocean_proximity", axis=1)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric preprocessing pipeline. In the Pipeline constructor every step
# except the last must be a transformer; only the last may be an estimator.
num_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),  # fill missing values with the median
    ('attribs_adder', CombinedAttributesAdder()),  # append the derived attributes
    ('std_scaler', StandardScaler()),  # standardize the data
])
# Fit the pipeline on the numeric training columns and transform them.
housing_num_tr = num_pipline.fit_transform(housing_num)
# Bare expression: has no effect in a script; presumably kept to display the
# array as a notebook cell output.
housing_num_tr

array([[-1.15604281,  0.77194962,  0.74333089, ..., -0.31205452,
        -0.08649871,  0.15531753],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.21768338,
        -0.03353391, -0.83628902],
       [ 1.18684903, -1.34218285,  0.18664186, ..., -0.46531516,
        -0.09240499,  0.4222004 ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.3469342 ,
        -0.03055414, -0.52177644],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.02499488,
         0.06150916, -0.30340741],
       [-1.43579109,  0.99645926,  1.85670895, ..., -0.22852947,
        -0.09586294,  0.10180567]])


# A way to process all columns (numeric and categorical) at the same time.
from sklearn.compose import ColumnTransformer

# Numeric-only view of the training data (non-numeric column removed).
housing_num = housing.drop("ocean_proximity", axis=1)
# print('housing_num:', housing_num)
# Iterating a DataFrame yields its column names, so this is the list of
# numeric column names.
num_attribs = list(housing_num)
# print('num_attribs:', num_attribs)
cat_attribs = ["ocean_proximity"]

# Route the numeric columns through num_pipline and the categorical column
# through a one-hot encoder.
full_pipeline = ColumnTransformer([
    ("num", num_pipline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
# Fit on the training predictors and produce the prepared feature matrix.
housing_prepared = full_pipeline.fit_transform(housing)

# Select and evaluate a model on the training set.
# Linear regression
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# Try the model on a few training instances (the first five rows).
some_data = housing.iloc[:5]
some_labels = housing_labels[:5]
# Use transform (not fit_transform) so the already-fitted pipeline is reused.
some_data_prepared = full_pipeline.transform(some_data)
print(some_data_prepared)
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t,", list(some_labels))

[[-1.15604281  0.77194962  0.74333089 -0.49323393 -0.44543821 -0.63621141
  -0.42069842 -0.61493744 -0.31205452 -0.08649871  0.15531753  1.
   0.          0.          0.          0.        ]
 [-1.17602483  0.6596948  -1.1653172  -0.90896655 -1.0369278  -0.99833135
  -1.02222705  1.33645936  0.21768338 -0.03353391 -0.83628902  1.
   0.          0.          0.          0.        ]
 [ 1.18684903 -1.34218285  0.18664186 -0.31365989 -0.15334458 -0.43363936
  -0.0933178  -0.5320456  -0.46531516 -0.09240499  0.4222004   0.
   0.          0.          0.          1.        ]
 [-0.01706767  0.31357576 -0.29052016 -0.36276217 -0.39675594  0.03604096
  -0.38343559 -1.04556555 -0.07966124  0.08973561 -0.19645314  0.
   1.          0.          0.          0.        ]
 [ 0.49247384 -0.65929936 -0.92673619  1.85619316  2.41221109  2.72415407
   2.57097492 -0.44143679 -0.35783383 -0.00419445  0.2699277   1.
   0.          0.          0.          0.        ]]
Predictions:	 [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
Labels:		, [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]

# Measure the error with RMSE (root mean squared error) on the training set.
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
# An error this large suggests that either the current features do not carry
# enough information for a good prediction or the model is not powerful enough.
print(lin_rmse)

68628.19819848923


# Train a decision tree on the same prepared data.
from sklearn.tree import DecisionTreeRegressor


tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

# RMSE measured on the same data the tree was trained on.
tree_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, tree_predictions)
tree_rmse = np.sqrt(tree_mse)

# NOTE(review): a training RMSE of 0.0 most likely means the tree memorized
# the training set (overfitting) rather than being a perfect model.
print(tree_rmse)

0.0

# Cross-validation
from sklearn.model_selection import cross_val_score


# 10-fold cross-validation of the decision tree on the training set.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
# The scoring is the *negative* MSE, so negate it before taking the root.
tree_rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Each value is printed on its own labelled line.
    """
    # Each statistic is computed lazily, right before its line is printed.
    report = (
        ("Scores:", lambda: scores),
        ("Mean:", lambda: scores.mean()),
        ("Standard deviation:", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())


# Report the decision tree's cross-validated RMSE per fold, with mean and spread.
display_scores(tree_rmse_scores)

Scores: [69675.82792311 65914.98862986 70852.38231522 69370.89549497
 71573.18392034 75212.9829044  71783.44686384 71186.60176798
 76364.14655943 70467.08826259]
Mean: 71240.15446417339
Standard deviation: 2783.778732671486

# Use the same 10-fold cross-validation to check the linear regression error.
line_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
# Scores are negative MSE values; negate before taking the square root.
line_rmse_scores = np.sqrt(-line_scores)


display_scores(line_rmse_scores)

Scores: [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
Mean: 69052.46136345083
Standard deviation: 2731.6740017983484

# Random forest
from sklearn.ensemble import RandomForestRegressor


# No random_state is given, so results can differ from run to run.
random_forest = RandomForestRegressor()
random_forest.fit(housing_prepared, housing_labels)

# RMSE measured on the training set itself.
forest_predictions = random_forest.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, forest_predictions)
forest_rmse = np.sqrt(forest_mse)
# Bare expression: no effect in a script; presumably kept to show the value
# as a notebook cell output.
forest_rmse

18749.228174890577

# Save the trained model to disk.
import joblib
joblib.dump(random_forest, "random_forest.pkl")
# ...and later, load the model back.
# NOTE(review): joblib files are pickle-based as far as I know; only load
# files from a trusted source.
my_model_loaded = joblib.load("random_forest.pkl")

# Once there is a shortlist of promising models, the chosen one needs fine-tuning.
# Grid search: let scikit-learn train on every combination of the listed
# hyperparameter values and pick the best combination.
from sklearn.model_selection import GridSearchCV


# First grid: 3 x 4 = 12 combinations; second grid (bootstrap disabled):
# 2 x 3 = 6 combinations. 18 combinations in total, each evaluated with cv=5.
param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

# Bare expression: the best hyperparameter combination found by the search.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

# Get the best estimator found by the grid search (bare expression, shown
# only as a notebook-style output).
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=8, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=30, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

# Evaluation scores: print the RMSE for every hyperparameter combination tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # mean_test_score is a negative MSE (see the scoring above), so negate it
    # before taking the square root.
    print(np.sqrt(-mean_score), params)

64739.590419929686 {'max_features': 2, 'n_estimators': 3}
56390.01172900751 {'max_features': 2, 'n_estimators': 10}
52922.27148535295 {'max_features': 2, 'n_estimators': 30}
60216.409558842715 {'max_features': 4, 'n_estimators': 3}
52769.43612050713 {'max_features': 4, 'n_estimators': 10}
50563.123998033945 {'max_features': 4, 'n_estimators': 30}
59872.0870489597 {'max_features': 6, 'n_estimators': 3}
52176.37289190631 {'max_features': 6, 'n_estimators': 10}
50075.85744107314 {'max_features': 6, 'n_estimators': 30}
57534.12698935773 {'max_features': 8, 'n_estimators': 3}
52146.81632651758 {'max_features': 8, 'n_estimators': 10}
49746.548736931334 {'max_features': 8, 'n_estimators': 30}
61352.403943075034 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
55081.777573816296 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60075.10384281149 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52934.215028399834 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
58571.43297177205 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51759.814819200175 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

# Evaluate the final model on the held-out test set.
final_model = grid_search.best_estimator_

# Split the test set into predictors and labels, as was done for training.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform only (no fitting): the pipeline must reuse what it learned from
# the training set.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# Bare expression: the test-set RMSE (displayed only in a notebook).
final_rmse