【机器学习实战】2_predict_median_house_values代码《Hands-On Machine Learning with Scikit-Learn&TensorFlow》

最新推荐文章于 2023-04-14 13:29:30 发布
wifi连不上
最新推荐文章于 2023-04-14 13:29:30 发布
阅读量436
点赞数
分类专栏： python 文章标签： python 机器学习
本文链接：https://blog.csdn.net/wifi_wuxian/article/details/96012063
版权
python 专栏收录该内容
28 篇文章 2 订阅
订阅专栏
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 17 14:53:17 2018

@author: Administrator
"""

#from __future__ import division,print_fuction,unicode_literals
#os 模块提供了非常丰富的方法用来处理文件和目录。
import os
#import tarfile
#from six.moves import urllib
import matplotlib.pyplot as plt
import pandas as pd
housing_path = r'D:\chengmi_吴限\python\handson-ml-master\handson-ml-master\datasets\housing\\'

def load_housing_data(housing_path):
    """Load the housing dataset from ``housing.csv`` inside ``housing_path``.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        A pandas DataFrame holding the CSV contents.
    """
    csv_path = os.path.join(housing_path,'housing.csv')
    # Hand the path to pandas so it opens and closes the file itself; the
    # previous explicit open() handle was never closed.
    return pd.read_csv(csv_path, encoding='UTF-8')
#保存图片
def save_fig(fig_id,tight_layout=True,fig_extension='png',resolution=300):
    """Save the current matplotlib figure into the ``housing_path`` directory.

    Args:
        fig_id: Base file name (without extension) of the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to savefig as the format.
        resolution: DPI passed to ``plt.savefig``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(housing_path,file_name)
    print('Saving figure',fig_id)
    if tight_layout:
        # Compact the layout before writing the file.
        plt.tight_layout()
    plt.savefig(target,format = fig_extension,dpi = resolution)
    
# Load the dataset and take a first look at its structure.
housing = load_housing_data(housing_path)
housing.head()
housing.info()

# Frequency of each category in the text attribute.
housing['ocean_proximity'].value_counts()

# Summary statistics of the numeric attributes.
housing.describe()
housing['longitude'].describe()

# Histograms of the attributes (50 bins each), saved via save_fig.
housing.hist(bins = 50 ,figsize= (20,15))
save_fig('attribute_histogram_plots')
plt.show()


import numpy as np
# Seed NumPy's global RNG so np.random.permutation() yields the same shuffle on every run.
np.random.seed(42)

'''
解决的办法之一
1-是保存第一次运行得到的测试集，并在随后的过程加载。
2-另一种方法是在调用 np.random.permutation() 之前，设置随机数生成器的种子（比如 np.random.seed(42) ），
以产生总是相同的洗牌指数（shuffled indices）
'''

'''
from numpy import *
num=0
while(num<5):
    random.seed(5) #使用相同的seed( )值,则每次生成的随即数都相同
    print(random.random())
    num+=1 
################
    
from numpy import *
num=0
random.seed(5) #如果不设置这个值，则系统根据时间来自己选择这个值，此时每次生成的随机数因时间差异而不同。
while(num<5):
    print(random.random())
    num+=1
'''
# 创建测试集，只要随机挑选一些实例，一般是数据集的20%
def split_train_test(data,test_ratio):
    """Randomly split ``data`` into a training set and a test set.

    Rows are shuffled with ``np.random.permutation`` (which returns a new
    shuffled array and leaves its input untouched); seed NumPy's global RNG
    beforehand for a reproducible split.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of rows (0..1) that goes to the test set.

    Returns:
        Tuple ``(train_set, test_set)`` of DataFrames.
    """
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    # The shuffled values are row positions, so both halves must be selected
    # with iloc; label-based loc only matches on a default 0..n-1 index.
    return data.iloc[train_indices],data.iloc[test_indices]
#len(housing)
#np.random.permutation(len(housing))
#int(len(housing))
#housing.head()
#len(housing)
# Purely random split: 20% of the rows go to the test set.
train_set,test_set=split_train_test(housing,0.2)
print(len(train_set),'train+',len(test_set),'test')
'''
a=int(len(housing)*0.2)
housing.iloc[np.random.permutation(len(housing))[:int(len(housing)*0.2)]]


'''
'''
解决的办法之一
1-是保存第一次运行得到的测试集，并在随后的过程加载。
2-另一种方法是在调用 np.random.permutation() 之前，设置随机数生成器的种子（比如 np.random.seed(42) ），
以产生总是相同的洗牌指数（shuffled indices）
但是如果数据集更新，这两个方法都会失效。一个通常的解决办法是使用每个实例的ID来判
定这个实例是否应该放入测试集（假设每个实例都有唯一并且不变的ID）。例如，你可以计
算出每个实例ID的哈希值，只保留其最后一个字节，如果该值小于等于 51（约为 256 的
20%），就将其放入测试集。这样可以保证在多次运行中，测试集保持不变，即使更新了数
据集。新的测试集会包含新实例中的 20%，但不会有之前位于训练集的实例。下面是一种可
用的方法
'''
###1################
import hashlib
def test_set_check(identifier,test_ratio,hash):
    return hash(np.int64(identifier)).digest()[-1]<256*test_ratio

def split_train_test_by_id(data,test_ratio,id_column,hash = hashlib.md5):
    """Split ``data`` into train and test sets based on a hash of each row's id.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction (0..1) passed through to ``test_set_check``.
        id_column: Name of the column holding the row identifiers.
        hash: Hash constructor passed through to ``test_set_check``.

    Returns:
        Tuple ``(train_set, test_set)`` of DataFrames.
    """
    def _belongs_to_test(id_):
        return test_set_check(id_,test_ratio,hash)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part,test_part
####2###############
    '''
from zlib import crc32
def test_set_check(identifier,test_ratio):
    return crc32(np.int64(identification)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data,test_ratio,id_colum):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_,test_ratio))
    return data.loc[~in_test_set],data.loc[in_test_set]
    '''
'''
#使用行索引作为 ID：
'''
# Use the row index as the identifier for the hash-based split.
housing_with_id = housing.reset_index() # adds an 'index' column
train_set,test_set = split_train_test_by_id(housing_with_id,0.2,'index')    
'''
#如果使用行索引作为唯一识别码，你需要保证新数据都放到现有数据的尾部，且没有行被删
#除。如果做不到，则可以用最稳定的特征来创建唯一识别码。例如，一个区的维度和经度在
#几百万年之内是不变的，所以可以将两者结合成一个 ID：
'''
# Build an identifier from longitude and latitude, then split on it.
housing_with_id['id'] = housing['longitude']*1000 + housing['latitude']
train_set,test_set = split_train_test_by_id(housing_with_id,0.2,'id')

train_set.head()
train_set.info()

#housing_with_id['id'] = housing['longitude']*1000 + housing['latitude']
#train_set,test_set = split_train_test_by_id(housing_with_id,0.2,'id')

''' train_test_split 将数据集分割成多个子集,它的作用和之前的函数 split_train_test 很像，
并带有其它一些功能,首先，它有一个 random_state 参数，可以设定前面讲过的随机生成器种子；第二，你可
以将种子传递给多个行数相同的数据集，可以在相同的索引上分割数据集（这个功能非常有
用，比如你的标签值是放在另一个 DataFrame 里的
'''

from sklearn.model_selection import train_test_split
# Same 80/20 random split using scikit-learn, with a fixed random_state.
train_set,test_set = train_test_split(housing,test_size = 0.2,random_state = 42)
test_set.head()
test_set.info()

# Compare the median_income distribution of the full data and both splits.
housing['median_income'].hist()
train_set['median_income'].hist()
test_set['median_income'].hist()
'''
大多数的收入中位数的值聚集在 2-5（万美元），但是一些收入中位数会超过 6。数据集中的
每个分层都要有足够的实例位于你的数据中，这点很重要。否则，对分层重要性的评估就会
有偏差。这意味着，你不能有过多的分层，且每个分层都要足够大。后面的代码通过将收入
中位数除以 1.5（以限制收入分类的数量），创建了一个收入类别属性，用 ceil 对值舍入
（以产生离散的分类），然后将所有大于 5的分类归入到分类 5：
'''
# Bucket median income into discrete categories: divide by 1.5 and round up.
housing['income_cat'] = np.ceil(housing['median_income']/1.5)
# Keep values below 5 and set everything else to 5.0. The result is assigned
# back explicitly: an inplace where() on the selected column is a chained
# assignment that pandas does not guarantee will reach the parent DataFrame.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat']<5,5.0)

# Inspect the resulting categories and their sizes.
set(housing['income_cat'])
housing['income_cat'].value_counts()
housing.info()
housing['income_cat'].hist()


'''
根据收入分类，进行分层采样。你可以使用 Scikit-Learn
的 StratifiedShuffleSplit 类：
'''
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 split keyed on the income category.
split = StratifiedShuffleSplit(n_splits =1,test_size =0.2,random_state =42)
# NOTE(review): split() yields positional indices; .loc only matches them while
# housing keeps its default 0..n-1 index — confirm before re-indexing housing.
for train_index,test_index in split.split(housing,housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
# Category proportions in the stratified test set ...
strat_test_set['income_cat'].value_counts()/len(strat_test_set)

# ... and in the full dataset, for comparison.
housing['income_cat'].value_counts()/len(housing)


def income_cat_proportions(data):
    """Return the share of rows of ``data`` falling in each ``income_cat`` value."""
    category_counts = data['income_cat'].value_counts()
    total_rows = len(data)
    return category_counts / total_rows
 
# Redo the random split now that housing carries the income_cat column.
train_set,test_set = train_test_split(housing,test_size = 0.2, random_state = 42)

# Income-category proportions: full data vs. stratified test set vs. random test set.
compare_props = pd.DataFrame({
        'Overall':income_cat_proportions(housing),
        'Stratified':income_cat_proportions(strat_test_set),
        'Random':income_cat_proportions(test_set),
        }).sort_index()
    
'''分层采样和纯随机采样的样本偏差比较
可以看到，分层采样测试集的收入分类比例
与总数据集几乎相同，而随机采样数据集偏差严重。
'''    
# Relative deviation (in percent) of each sampling method from the overall proportions.
compare_props['Rand. %error'] = 100*compare_props['Random'] / compare_props['Overall'] -100
compare_props['Strat. %error'] = 100*compare_props['Stratified'] / compare_props['Overall'] -100
compare_props
#   ??????????????????? 
''' 
你需要删除 income_cat 属性，使数据回到初始状态'''
# Remove the helper column from both stratified sets (modified in place).
for set_ in(strat_train_set,strat_test_set):
    set_.drop('income_cat',axis = 1,inplace = True)

#数据探索和可视化、发现规律
# Plain geographic scatter plot of all districts.
housing.plot(kind = 'scatter',x ='longitude',y = 'latitude')
save_fig('bad_visualization_plot')
'''
将 alpha 设为 0.1，可以更容易看出数据点的密度,显示高密度区域的散点图
'''
# Same plot with alpha = 0.1 so that overlapping points show density.
housing.plot(kind = 'scatter',x ='longitude',y = 'latitude',alpha = 0.1)
save_fig( 'better_visualization_plot')

'''
#房价。每个圈的半径表示街区的人口（选项 s ），颜色代表价格（选
项 c ）。我们用预先定义的名为 jet 的颜色图（选项 cmap ），它的范围是从蓝色（低价）
到红色（高价）：
'''
# Marker size follows population/100; marker colour follows median_house_value (jet colormap).
housing.plot(kind = 'scatter',x='longitude',y='latitude',alpha = 0.4, s=housing['population']/100,
             label = 'population',c = 'median_house_value',cmap = plt.get_cmap('jet'),colorbar = True,)
plt.legend()
save_fig( 'housing_visualization_plot')
'''
这张图说明房价和位置（比如，靠海）和人口密度联系密切，这点你可能早就知道。可以使
用聚类算法来检测主要的聚集，用一个新的特征值测量聚集中心的距离。尽管北加州海岸区
域的房价不是非常高，但离大海距离属性也可能很有用，所以这不是用一个简单的规则就可
以定义的问题
'''
'''
因为数据集并不是非常大，你可以很容易地使用 corr() 方法计算出每对属性间的标准相关系
数（standard correlation coefficient，也称作皮尔逊相关系数）：
'''
# Pearson correlation between every pair of numeric attributes. Restrict the
# frame to numeric columns first: housing still contains the text column
# 'ocean_proximity', which newer pandas versions refuse to correlate.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
housing.info()

'''
import matplotlib.image as mpimg
california_img = mpimg.imread(PROJECT_ROOT_DIR+'/images/end_to_end_project/california.png')
ot(kind='scatter',x='longitude',y='latitude',figsize=(10,7),s=housing['population']/100,
   label='Population',c='median_house_value',cmap=plt.get_cmap('jet'),
   colorbar=False,alpha = 0.4,)
'''

'''
现在来看下每个属性和房价中位数的关联度：
'''
# Correlation of each attribute with the median house value, strongest first.
corr_matrix['median_house_value'].sort_values(ascending = False )

from pandas.plotting import scatter_matrix
# Pairwise scatter plots for a handful of selected attributes.
attributes = ['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attributes],figsize = (12,8))
save_fig('scatter_matrix_plot')

# Close-up of median_income against median_house_value.
housing.plot(kind = 'scatter',x='median_income',y='median_house_value',alpha = 0.1)
plt.axis([0,16,0,550000])
save_fig('income_vs_house_value_scatterplot')

# Try out combined attributes expressed as per-household / per-room ratios.
housing['rooms_per_household']= housing['total_rooms']/housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household'] = housing['population']/housing['households']

# Recompute the correlations with the new columns. Only numeric columns are
# used: housing still contains the text column 'ocean_proximity', which newer
# pandas versions refuse to correlate.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

housing.plot(kind='scatter',x='rooms_per_household',y='median_house_value',alpha=0.2)
plt.axis([0,5,0,520000])
plt.show()

housing.describe()

##为机器学习准备数据
'''
现在来为机器学习算法准备数据。不要手工来做，你需要写一些函数，理由如下：
函数可以让你在任何数据集上（比如，你下一次获取的是一个新的数据集）方便地进行
重复数据转换。
你能慢慢建立一个转换函数库，可以在未来的项目中复用。
在将数据传给算法之前，你可以在实时系统中使用这些函数。
这可以让你方便地尝试多种数据转换，查看哪些转换方法结合起来效果最好。

但是，还是先回到干净的训练集（通过再次复制 strat_train_set ），将预测量和标签分开，
因为我们不想对预测量和目标值应用相同的转换
（注意 drop() 创建了一份数据的备份，而不影响 strat_train_set ）：
'''
# Start again from the stratified training set: predictors go to `housing`,
# the target column goes to `housing_labels`.
housing = strat_train_set.drop('median_house_value',axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

#数据清洗
'''
大多机器学习算法不能处理缺失的特征，因此先创建一些函数来处理特征缺失的问题。前
面，你应该注意到了属性 total_bedrooms 有一些缺失值。有三个解决选项：
去掉对应的街区；
去掉整个属性；
进行赋值（0、平均值、中位数等等）
用DataFrame的dropna(),drop(),fillna()方法可以方便实现：
'''

# Rows that contain at least one missing value.
sample_incomplete_rows = housing[housing.isnull().any(axis=1)]
housing.isnull().any(axis=1).sum() # number of rows containing a missing value
housing.isnull().any(axis=0).sum() # number of columns containing a missing value
sample_incomplete_rows 
sample_incomplete_rows.dropna(subset = ['total_bedrooms']) # option 1: drop the affected rows
sample_incomplete_rows.info()

housing.dropna(subset = ['total_bedrooms'])

# NOTE(review): bare arithmetic with no effect; looks like a leftover row-count check.
158+16354

housing.drop('total_bedrooms',axis = 1)

sample_incomplete_rows.drop('total_bedrooms',axis = 1)  # option 2: drop the whole attribute


# Option 3: fill the missing values with the training-set median.
median = housing['total_bedrooms'].median()
housing['total_bedrooms'].fillna(median) # returns a filled copy; housing itself is unchanged

# sample_incomplete_rows was produced by filtering housing. Work on an explicit
# copy and assign the filled column back, instead of an inplace fillna() on a
# selected column, which pandas does not guarantee will update the frame.
sample_incomplete_rows = sample_incomplete_rows.copy()
sample_incomplete_rows['total_bedrooms'] = sample_incomplete_rows['total_bedrooms'].fillna(median)
sample_incomplete_rows
sample_incomplete_rows.isnull().any(axis=0).sum()
'''
如果选择选项 3，你需要计算训练集的中位数，用中位数填充训练集的缺失值，不要忘记保存
该中位数。后面用测试集评估系统时，需要替换测试集中的缺失值，也可以用来实时替换新
数据中的缺失值。
Scikit-Learn 提供了一个方便的类来处理缺失值： Imputer 。下面是其使用方法：首先，需要
创建一个 Imputer 实例，指定用某属性的中位数来替换该属性所有的缺失值：
'''
# sklearn.preprocessing.Imputer no longer exists in newer scikit-learn
# releases; sklearn.impute.SimpleImputer is its replacement. Import whichever
# is available under the name the rest of this script uses.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
# Replace missing values of each attribute with that attribute's median.
imputer = Imputer(strategy = 'median')
'''
因为只有数值属性才能算出中位数，我们需要创建一份不包括文本属性 ocean_proximity 的数
据副本
'''
# Numeric-only copy of the data (the text column is dropped).
housing_num = housing.drop('ocean_proximity',axis=1)

# Fit the imputer on the training data.
imputer.fit(housing_num)
'''
imputer 计算出了每个属性的中位数，并将结果保存在了实例变量 statistics_ 中。虽然此时
只有属性 total_bedrooms 存在缺失值，但我们不能确定在以后的新的数据中会不会有其他属
性也存在缺失值，所以安全的做法是将 imputer 应用到每个数值
'''
# statistics_ is an attribute of the fitted imputer.

imputer.statistics_ # per-column statistic computed by the imputer
housing_num.median().values # per-column medians computed with pandas, for comparison
#
'''
现在，你就可以使用这个“训练过的” imputer 来对训练集进行转换，将缺失值替换为中位数：
'''
X = imputer.transform(housing_num) # the result is a NumPy array

'''
结果是一个包含转换后特征的普通的 Numpy 数组。如果你想将其放回到
Pandas DataFrame 中，也很简单
'''
# Wrap the transformed array in a DataFrame again, restoring the column names.
housing_tr =pd.DataFrame(X,columns=housing_num.columns)

pd.DataFrame(X) # without columns= the column names are lost

housing_tr.loc[sample_incomplete_rows.index.values]

# =============================================================================
# 处理文本和类别属性
# =============================================================================
'''
前面，我们丢弃了类别属性 ocean_proximity ，因为它是一个文本属性，不能计算出中位数。
大多数机器学习算法更喜欢和数字打交道，所以让我们把这些文本标签转换为数字。
Scikit-Learn 为这个任务提供了一个转换器 LabelEncoder ：
'''
from sklearn.preprocessing import LabelEncoder
# Encode the text categories of ocean_proximity as integers.
encoder = LabelEncoder()
housing_cat = housing['ocean_proximity']
housing_cat_encoded =encoder.fit_transform(housing_cat)
housing_cat_encoded

'''
译注:
在原书中使用 LabelEncoder 转换器来转换文本特征列的方式是错误的，该转换器只能用
来转换标签（正如其名）。在这里使用 LabelEncoder 没有出错的原因是该数据只有一列
文本特征值，在有多个文本特征列的时候就会出错。应使用 factorize() 方法来进行操
作：
'''
# Alternative encoding with pandas: factorize() returns the integer codes and the categories.
housing_cat_encoded,housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]

'''
好了一些，现在就可以在任何 ML 算法里用这个数值数据了。你可以查看映射表，编码器是通
过属性 classes_ 来学习的（ <1H OCEAN 被映射为 0， INLAND 被映射为 1，等等）：
'''
# Categories learned by the LabelEncoder fitted above.
print(encoder.classes_)

'''
这种做法的问题是，ML 算法会认为两个临近的值比两个疏远的值要更相似。显然这样不对
（比如，分类 0 和 4 比 0 和 1 更相似）。要解决这个问题，一个常见的方法是给每个分类创
建一个二元属性：当分类是 <1H OCEAN ，该属性为 1（否则为 0），当分类是 INLAND ，另一
个属性等于 1（否则为 0），以此类推。这称作独热编码（One-Hot Encoding），因为只有一
个属性会等于 1（热），其余会是 0（冷）。
Scikit-Learn 提供了一个编码器 OneHotEncoder ，用于将整数分类值转变为独热向量。注
意 fit_transform() 用于 2D 数组，而 housing_cat_encoded 是一个 1D 数组，所以需要将其变
形
'''
# =============================================================================
# 那么reshape(1,-1)呢？也就是直接变成了一行了。那这个-1在这里要怎么理解呢？跟进numpy库官网的介绍，
# 这里的-1被理解为unspecified value，意思是未指定为给定的。如果我只需要特定的行数，列数多少我无所谓，我只需要指定行数，
# 那么列数直接用-1代替就行了，计算机帮我们算应有多少列，反之亦然。所以-1在这里应该可以理解为一个正整数通配符，它代替任何整数。
# 指定为-1的时候，该维度的大小由数组元素总数和其余维度自动推算得出（并不是随机分配）
# =============================================================================


from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
# fit_transform() expects a 2D array while housing_cat_encoded is 1D, hence the reshape(-1,1).
'''
注意输出结果是一个 SciPy 稀疏矩阵，而不是 NumPy 数组。当类别属性有数千个分类时，
这样非常有用。经过独热编码，我们得到了一个有数千列的矩阵，这个矩阵每行只有一个 1，
其余都是 0。使用大量内存来存储这些 0 非常浪费，所以稀疏矩阵只存储非零元素的位置。
你可以像一个 2D 数据那样进行使用，但是如果你真的想将其转变成一个（密集的）NumPy
数组，只需调用 toarray() 方法：关联矩阵
'''
# Convert the one-hot result to a dense array.
housing_cat_1hot.toarray()
'''
使用类 LabelBinarizer ，我们可以用一步执行这两个转换（从文本分类到整数分类，再从整
数分类到独热向量）：
'''
from sklearn.preprocessing import LabelBinarizer
# Text categories to one-hot vectors in a single step.
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

'''
注意默认返回的结果是一个密集 NumPy 数组。向构造器 LabelBinarizer 传
递 sparse_output=True ，就可以得到一个稀疏矩阵
译注:
在原书中使用 LabelBinarizer 的方式也是错误的，该类也应用于标签列的转换。正确做
法是使用sklearn即将提供的 CategoricalEncoder 类。如果在你阅读此文时sklearn中尚未
提供此类，用如下方式代替：
'''
'''
每个子流水线都以一个选择转换器开始：通过选择对应的属性（数值或分类）、丢弃其它
的，来转换数据，并将输出 DataFrame 转变成一个 NumPy 数组。Scikit-Learn 没有工具来处
理 Pandas DataFrame ，因此我们需要写一个简单的自定义转换器来做这项工作：
'''
#from sklearn import CategoricalEncoder

from sklearn.base import BaseEstimator,TransformerMixin

from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

# Column positions used by CombinedAttributesAdder.transform().
rooms_ix,bedrooms_ix,population_ix,household_ix = 3,4,5,6
class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    """Transformer that appends ratio columns to a 2D array.

    Always appended: rooms per household and population per household.
    Appended when ``add_bedrooms_per_room`` is true: bedrooms per room.
    Columns are located through the module-level ``*_ix`` constants.
    """
    def __init__(self,add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self,X,y = None):
        # Nothing to learn.
        return self
    def transform(self,X,y=None):
        per_household_rooms = X[:,rooms_ix]/X[:,household_ix]
        per_household_population = X[:,population_ix]/X[:,household_ix]
        if not self.add_bedrooms_per_room:
            return np.c_[X,per_household_rooms,per_household_population]
        bedroom_share = X[:,bedrooms_ix]/X[:,rooms_ix]
        return np.c_[X,per_household_rooms,per_household_population,bedroom_share]
# Append the two per-household ratio columns (bedrooms_per_room disabled).
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room= False)
housing_extra_attribs=attr_adder.transform(housing.values)

# Back to a DataFrame, naming the two appended columns.
housing_extra_attribs = pd.DataFrame(
        housing_extra_attribs,
        columns=list(housing.columns)+["rooms_per_household", "population_per_household"])

housing_extra_attribs.head()
# This line previously started with a stray space, which is an IndentationError
# at module level.
housing_extra_attribs.info()
'''
转换流水线
你已经看到，存在许多数据转换步骤，需要按一定的顺序执行。幸运的是，Scikit-Learn 提供
了类 Pipeline ，来进行这一系列的转换。下面是一个数值属性的小流水线：
'''          
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing: impute medians, append the ratio columns, standardise.
num_pipeline = Pipeline([
        ('imputer',Imputer(strategy = 'median')),
        # Exactly one 'attribs_adder' step: Pipeline rejects duplicate step
        # names, and a second pass would append the ratio columns again.
        ('attribs_adder',CombinedAttributesAdder()),
        ('std_scaler',StandardScaler()),
        ])
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr
'''
from future_encoders import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
full_pipeline = ColumnTransformer([
        ('num',num_pipeline,num_attribs),
        ('cat',OneHotEncoder(),cat_attribs),
        ])
housing_prepared = full_pipeline.fit_transform(housing)
 ''' 
'''

自定义转换器
尽管 Scikit-Learn 提供了许多有用的转换器，你还是需要自己动手写转换器执行任务，
比如自定义的清理操作，或属性组合。你需要让自制的转换器与 Scikit-Learn 组件（比如流水线）无缝衔接工作，
因为 Scikit-Learn 是依赖鸭子类型的（而不是继承），你所需要做的是创建一个类并执行三个方法：
fit()（返回self），transform()，和fit_transform()。通过添加TransformerMixin作为基类，
可以很容易地得到最后一个。另外，如果你添加BaseEstimator作为基类（且构造器中避免使用*args和**kargs），
你就能得到两个额外的方法（get_params()和set_params()），二者可以方便地进行超参数自动微调。
例如，一个小转换器类添加了上面讨论的属性：
'''  
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks the given columns and returns their ``.values``."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing to learn.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values
 
class NewLabelBinarizer(TransformerMixin):
    """Wrapper around LabelBinarizer whose fit/transform also accept a ``y`` argument.

    Constructor arguments are forwarded unchanged to LabelBinarizer; ``y`` is
    accepted by fit() and transform() but not used.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        wrapped = self.encoder
        wrapped.fit(x)
        return self

    def transform(self, x, y=0):
        binarized = self.encoder.transform(x)
        return binarized
    
from sklearn.pipeline import FeatureUnion
# Column names handled by the numeric and the categorical sub-pipeline.
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

# Numeric branch: select columns, impute medians, append ratio columns, standardise.
num_pipeline = Pipeline([
        ('selector',DataFrameSelector(num_attribs)),
        ('imputer',Imputer(strategy='median')),
        ('attribs_adder',CombinedAttributesAdder()),
        ('std_scaler',StandardScaler()),
        ])
# Categorical branch: select the text column and binarize it.
cat_pipeline= Pipeline([
        ('selector',DataFrameSelector(cat_attribs)),
        ('cat_encoder', NewLabelBinarizer()),
        ])
# Combine the outputs of both branches.
full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline',num_pipeline),
        ('cat_pipeline',cat_pipeline),
        ])

from sklearn.base import TransformerMixin #gives fit_transform method for free
class MyLabelBinarizer(TransformerMixin):
    """LabelBinarizer wrapper whose fit/transform also accept a ``y`` argument.

    Constructor arguments are forwarded unchanged to LabelBinarizer; ``y`` is
    accepted by fit() and transform() but not used.
    """
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, x, y=0):
        inner = self.encoder
        inner.fit(x)
        return self
    def transform(self, x, y=0):
        encoded = self.encoder.transform(x)
        return encoded

# Run the complete preparation pipeline on the training predictors.
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
    
'''
选择并训练模型
可到这一步了！你在前面限定了问题、获得了数据、探索了数据、采样了一个测试集、写了
自动化的转换流水线来清理和为算法准备数据。现在，你已经准备好选择并训练一个机器学
习模型了。
'''
'''
你现在就有了一个可用的线性回归模型。用一些训练集中的实例做下验证：
'''
from sklearn.linear_model import LinearRegression
# Train a linear regression model on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

# Sanity check: predictions for the first five training rows next to their labels.
some_data = housing.iloc[:5]
some_data.info() 
some_label = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print('Predictions:\t',lin_reg.predict(some_data_prepared))
print('Labels:\t\t',list(some_label))

'''
使用 ScikitLearn 的 mean_squared_error 函数，用全部训练集来计算下这个回归模型的 RMSE：
'''
from sklearn.metrics import mean_squared_error
# RMSE of the linear model measured on the training set itself.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels,housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
'''
OK，有总比没有强，但显然结果并不好：大多数街区的 median_housing_values 位于 120000
到 265000 美元之间，因此预测误差 68628 美元不能让人满意。
这是一个模型欠拟合训练数据的例子。
当这种情况发生时，意味着特征没有提供足够多的信息来做出一个好的预测，或
者模型并不强大。就像前一章看到的，修复欠拟合的主要方法是选择一个更强大的模型，给
训练算法提供更好的特征，或去掉模型上的限制。这个模型还没有正则化，所以排除了最后
一个选项。你可以尝试添加更多特征（比如，人口的对数值），但是首先让我们尝试一个更
为复杂的模型，看看效果。
来训练一个 DecisionTreeRegressor 。这是一个强大的模型，可以发现数据中复杂的非线性关
系（决策树会在第 6 章详细讲解）。代码看起来很熟悉：
'''
from sklearn.tree import DecisionTreeRegressor
# Train a decision tree and measure its RMSE on the training set itself.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels,housing_predictions)
tree_rmses = np.sqrt(tree_mse)
tree_rmses
'''
等一下，发生了什么？没有误差？这个模型可能是绝对完美的吗？当然，更大可能性是这个
模型严重过拟合数据。如何确定呢？如前所述，直到你准备运行一个具备足够信心的模型，
都不要碰测试集，因此你需要使用训练集的部分数据来做训练，用一部分来做模型验证。

使用交叉验证做更佳的评估
评估决策树模型的一种方法是用函数 train_test_split 来分割训练集，得到一个更小的训练
集和一个验证集，然后用更小的训练集来训练模型，用验证集来评估。这需要一定工作量，
并不难而且也可行。

另一种更好的方法是使用 Scikit-Learn 的交叉验证功能。下面的代码采用了 K 折交叉验证
（K-fold cross-validation）：它随机地将训练集分成十个不同的子集，成为“折”，然后训练评
估决策树模型 10 次，每次选一个不同的折来做评估，用其它 9 个来做训练。结果是一个包含
10 个评分的数组：
'''
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree.
scores = cross_val_score(tree_reg,housing_prepared,housing_labels,
                         scoring='neg_mean_squared_error',cv=10)
# The scorer returns negated MSE values, so flip the sign before the square root.
tree_rmse_scores = np.sqrt(-scores)
tree_rmse_scores
'''
警告：Scikit-Learn 交叉验证功能期望的是效用函数（越大越好）而不是损失函数（越低
越好），因此得分函数实际上与 MSE 相反（即负值），这就是为什么前面的代码在计算
平方根之前先计算 -scores 。
'''
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    Args:
        scores: Object providing ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    # Each value is computed lazily so that lines are printed in order,
    # one after another, exactly as separate print statements would.
    report = (
        ('Scores:', lambda: scores),
        ('Mean:', lambda: scores.mean()),
        ('standard deviation:', lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())
    
# Cross-validation RMSE summary for the decision tree.
display_scores(tree_rmse_scores)

'''
现在决策树就不像前面看起来那么好了。实际上，它看起来比线性回归模型还糟！注意到交
叉验证不仅可以让你得到模型性能的评估，还能测量评估的准确性（即，它的标准差）。决
策树的评分大约是 71200，通常波动有 ±3200。如果只有一个验证集，就得不到这些信息。
但是交叉验证的代价是训练了模型多次，不可能总是这样。
让我们计算下线性回归模型的的相同分数，以做确保：
'''
# Same 10-fold cross-validation for the linear regression model.
lin_scores = cross_val_score(lin_reg,housing_prepared,housing_labels,
                             scoring='neg_mean_squared_error',cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)

'''
判断没错：决策树模型过拟合很严重，它的性能比线性回归模型还差。
现在再尝试最后一个模型： RandomForestRegressor 。第7章我们会看到，随机森林是通过用特
征的随机子集训练许多决策树。在其它多个模型之上建立模型称为集成学习（Ensemble
Learning），它是推进 ML 算法的一种好方法。我们会跳过大部分的代码，因为代码本质上和
其它模型一样：
'''
from sklearn.ensemble import RandomForestRegressor
# Train a random forest and measure its RMSE on the training set itself.
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels,housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

'''
现在好多了：随机森林看起来很有希望。但是，训练集的评分仍然比验证集的评分低很多。
解决过拟合可以通过简化模型，给模型加限制（即，规整化），或用更多的训练数据。在深
入随机森林之前，你应该尝试下机器学习算法的其它类型模型（不同核心的支持向量机，神
经网络，等等），不要在调节超参数上花费太多时间。目标是列出一个可能模型的列表（两
到五个）
提示：你要保存每个试验过的模型，以便后续可以再用。要确保有超参数和训练参数，
以及交叉验证评分，和实际的预测值。这可以让你比较不同类型模型的评分，还可以比
较误差种类。你可以用 Python 的模块 pickle ，非常方便地保存 Scikit-Learn 模型，或
使用 sklearn.externals.joblib ，后者序列化大 NumPy 数组更有效率：
from sklearn.externals import joblib
joblib.dump(my_model,'my_model.pkl')
my_model_loaded = joblib.load('my_model.pkl')
'''
#模型微调
'''
网格搜索
微调的一种方法是手工调整超参数，直到找到一个好的超参数组合。这么做的话会非常冗
长，你也可能没有时间探索多种组合。
你应该使用 Scikit-Learn 的 GridSearchCV 来做这项搜索工作。你所需要做的是告
诉 GridSearchCV 要试验有哪些超参数，要试验什么值， GridSearchCV 就能用交叉验证试验所
有可能超参数值的组合。例如，下面的代码搜索了 RandomForestRegressor 超参数值的最佳组
合：
'''
from sklearn.model_selection import GridSearchCV
# Two grids: 3 x 4 combinations with the default bootstrap setting, then
# 2 x 3 combinations with bootstrap disabled.
param_grid = [
        {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
        # The key must be 'max_features'; 'max_feature' is not a
        # RandomForestRegressor parameter and makes the search fail.
        {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]},
        ]
forest_reg = RandomForestRegressor()
# 5-fold cross-validated search scored by negated MSE.
grid_search = GridSearchCV(forest_reg,param_grid,cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared,housing_labels)

'''
当你不能确定超参数该有什么值，一个简单的方法是尝试连续的 10 的幂（如果想要一个
粒度更小的搜寻，可以用更小的数，就像在这个例子中对超参数 n_estimators 做的）。
'''
'''
param_grid 告诉 Scikit-Learn 首先评估所有的列在第一个 dict 中
的 n_estimators 和 max_features 的 3 × 4 = 12 种组合（不用担心这些超参数的含义，会在
第 7 章中解释）。然后尝试第二个 dict 中超参数的 2 × 3 = 6 种组合，这次会将超参
数 bootstrap 设为 False 而不是 True （后者是该超参数的默认值）。
总之，网格搜索会探索 12 + 6 = 18 种 RandomForestRegressor 的超参数组合，会训练每个模
型五次（因为用的是五折交叉验证）。换句话说，训练总共有 18 × 5 = 90 轮！K 折将要花费
大量时间，完成后，你就能获得参数的最佳组合，如下所示：
'''
# Best hyperparameter combination found by the grid search.
grid_search.best_params_
'''
提示：因为 30 是 n_estimators 的最大值，你也应该估计更高的值，因为评估的分数可
能会随 n_estimators 的增大而持续提升。
你还能直接得到最佳的估计器：
'''
# Estimator built with the best hyperparameter combination.
grid_search.best_estimator_

###################
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the random forest. The estimator passed in must
# be forest_reg; the previous code evaluated tree_reg, so the "forest" scores
# were decision-tree scores.
forest_scores = cross_val_score(forest_reg,housing_prepared,housing_labels,
                         scoring='neg_mean_squared_error',cv=10)
# The scorer returns negated MSE values, so flip the sign before the square root.
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

# Cross-validated RMSE of the linear model (default number of folds), summarised with describe().
scores = cross_val_score(lin_reg,housing_prepared,housing_labels,scoring='neg_mean_squared_error')
pd.Series(np.sqrt(-scores)).describe()


from sklearn.svm import SVR
# Support vector regressor with a linear kernel; RMSE measured on the training set.
svm_reg = SVR(kernel='linear')
svm_reg.fit(housing_prepared,housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels,housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

#from sklearn.model_selection import GridSeachCV
from scipy.stats import geom,expon
# Draw 10000 samples from a geometric and an exponential distribution and plot their histograms.
geom_distrib = geom(0.5).rvs(10000,random_state=42)
expon_distrib = expon(scale=1).rvs(10000,random_state=42)
plt.hist(geom_distrib,bins=50)
plt.show()
plt.hist(expon_distrib,bins=50)
plt.show()

###课后习题
'''
尝试一个支持向量机回归器（ sklearn.svm.SVR ），使用多个超参数，比
如 kernel="linear" （多个超参数 C 值）。现在不用担心这些超参数是什么含义。最佳
的 SVR 预测表现如何
'''
#2
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon,reciprocal
# Sampling distributions for the SVR hyperparameters.
param_distribs = {
        'kernel':['linear','rbf'],
        'C':reciprocal(20,200000),
        'gamma':expon(scale=1.0),
        }
svm_reg = SVR()
# 50 sampled candidates, each evaluated with 5-fold cross-validation.
rnd_search = RandomizedSearchCV(svm_reg,param_distributions=param_distribs,
                                n_iter=50,cv=5,scoring='neg_mean_squared_error',
                                verbose=2,n_jobs=4,random_state=42)
rnd_search.fit(housing_prepared,housing_labels)
# best_score_ is a negated MSE, so flip the sign before the square root.
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

rnd_search.best_params_

# Histogram of exponential samples (left) and of their logarithm (right).
expon_distrib = expon(scale=1.)
samples = expon_distrib.rvs(10000,random_state=42)
plt.subplot(121)
plt.title('Exponential ditrbution (scale=1.0)')
plt.hist(samples,bins=50)
plt.subplot(122)
plt.title('Log od this distribution')
plt.hist(np.log(samples),bins=50)
plt.show()

# Histogram of reciprocal-distribution samples (left) and of their logarithm (right).
reciprocal_distrib = reciprocal(20,200000)
samples = reciprocal_distrib.rvs(10000,random_state=42)
plt.figure(figsize=(10,4))
plt.subplot(121)
plt.title('Reciprocal distribution(scale=1.0)')
plt.hist(samples,bins=50)
# plt.subplt does not exist; the second panel needs plt.subplot.
plt.subplot(122)
plt.title('Log of this distribution')
plt.hist(np.log(samples),bins=50)
plt.show()

#3
'''
尝试在准备流水线中添加一个只选择最重要属性的转换器
'''
from sklearn.base import BaseEstimator,TransformerMixin
def indices_of_top_k(arr,k):
    """Return the indices of the ``k`` largest values of ``arr`` in ascending index order.

    Args:
        arr: One-dimensional sequence of comparable values.
        k: Number of indices to return (0 <= k <= len(arr)).

    Returns:
        NumPy integer array with ``k`` sorted indices.
    """
    if k == 0:
        # arr[-0:] is the whole array, so the slice below would select every
        # index instead of none.
        return np.array([], dtype=np.intp)
    # argpartition puts the k largest values in the last k slots (unordered);
    # sorting afterwards returns their indices in ascending order.
    return np.sort(np.argpartition(np.array(arr),-k)[-k:])

class TopFeatureSelector(BaseEstimator,TransformerMixin):
    """Transformer that keeps only the ``k`` columns with the largest importance scores.

    Args:
        feature_importance: Sequence with one importance score per input column.
        k: Number of columns to keep.

    After fit(), ``feature_indices_`` holds the selected column indices.
    """
    def __init__(self,feature_importance,k):
        # BaseEstimator.get_params() (used by clone() and the search classes)
        # reads each constructor argument back from an attribute of the same
        # name, so the value is stored under the parameter's own name.
        self.feature_importance = feature_importance
        self.k = k
    @property
    def feature_importances(self):
        # Read-only alias for code that used the previous attribute name.
        return self.feature_importance
    def fit(self,X,y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importance,self.k)
        return self
    def transform(self,X):
        return X[:,self.feature_indices_]

# feature_importances and k are ordinary module-level values, not class
# attributes; they were never defined, so every line below raised NameError.
# The importances come from the best random forest found by grid_search.
feature_importances = grid_search.best_estimator_.feature_importances_
k = 5
top_k_feature_indices = indices_of_top_k(feature_importances,k)
top_k_feature_indices

# Column names of housing_prepared, in pipeline output order: the numeric
# columns, the three ratio columns appended by CombinedAttributesAdder, then
# one column per ocean_proximity category (LabelBinarizer sorts its classes).
extra_attribs = ['rooms_per_household','population_per_household','bedrooms_per_room']
cat_one_hot_attribs = sorted(housing['ocean_proximity'].unique())
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
np.array(attributes)[top_k_feature_indices]

sorted(zip(feature_importances,attributes),reverse=True)[:k]

# Full preparation followed by selection of the k most important features.
preparation_and_feature_selection_pipeline = Pipeline([
        ('preparation',full_pipeline),
        ('feature_selection',TopFeatureSelector(feature_importances,k))
        ])

# The variable name was misspelled here ("..._pipelin"), raising NameError.
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)
housing_prepared_top_k_features[0:3]


#4尝试创建一个单独的可以完成数据准备和最终预测的流水线。
# Single pipeline: data preparation, feature selection, then an SVR built
# from the best parameters of the randomized search.
prepare_select_and_predict_pipeline = Pipeline([
        ('preparation',full_pipeline),
        ('feature_selection',TopFeatureSelector(feature_importances,k)),
        ('svm_reg',SVR(**rnd_search.best_params_))
        ])
prepare_select_and_predict_pipeline.fit(housing,housing_labels)

# Try the pipeline on the first four training rows.
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print('Predictions:\t',prepare_select_and_predict_pipeline.predict(some_data))
print('Labels:\t\t',list(some_labels))

#5使用 GridSearchCV 自动探索一些准备过程中的候选项。
# Nested parameters are addressed as <step>__<sub-step>__<parameter> with
# double underscores. The imputer sits in the 'num_pipeline' branch of the
# 'preparation' step, and k belongs to the 'feature_selection' step.
param_grid = [
        {'preparation__num_pipeline__imputer__strategy':['mean','median','most_frequent'],
         'feature_selection__k':list(range(1,len(feature_importances)+1))}
        ]
# 5-fold cross-validated search over the preparation options.
grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline,param_grid,cv=5,
                                scoring='neg_mean_squared_error',verbose=2,n_jobs=4)
grid_search_prep.fit(housing,housing_labels)

grid_search_prep.best_params_
wifi连不上
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【机器学习实战】2_predict_median_house_values代码《Hands-On Machine Learning with Scikit-Learn&TensorFlow》

# -*- coding: utf-8 -*-"""Created on Mon Dec 17 14:53:17 2018@author: Administrator"""#from __future__ import division,print_fuction,unicode_literals#os 模块提供了非常丰富的方法用来处理文件和目录。import os#impor...
复制链接

扫一扫