机器学习实战(jupyter)/第一部分第二章/Pipeline的使用

简单应用

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# A Pipeline bundles several data-processing steps and runs them in order.
pl_svm = Pipeline([
    ('normal',Normalizer()),    # normalization
    ('PCA',PCA()),              # dimensionality reduction
    ('SVC',SVC())               # support vector machine classifier
])

# Tiny toy training set: six 2-D samples with binary labels.
X_train = [[1,2],[2,3],[-1,-3],[-5,-6],[6,6],[-2,-1]]
Y_train = [1,1,0,0,1,0]
# Fit the whole pipeline on the training data, then predict on that same data.
clf = pl_svm.fit(X_train,Y_train)
print(clf.predict(X_train))

# Predict on samples that were not part of the training set.
X_test = [[2,2],[10,10],[-10,-10],[-5,-9]]
print(clf.predict(X_test))




再看看在机器学习实战/第一部分第二章程序里的应用

导入库并加载数据

# 加载库
import os
import tarfile
import numpy as np
import pandas as pd
from six.moves import urllib
#DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
#HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

#当数据会定期发生变化时,还可以写个小脚本来自动下载并解压数据。
# def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
#     os.makedirs(housing_path, exist_ok=True)
#     tgz_path = os.path.join(housing_path, "housing.tgz")
#     urllib.request.urlretrieve(housing_url, tgz_path)
#     housing_tgz = tarfile.open(tgz_path)
#     housing_tgz.extractall(path=housing_path)
#     housing_tgz.close()
# fetch_housing_data()

# Load the data
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in *housing_path*.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        Whatever ``pd.read_csv`` returns for that file (a DataFrame).
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

查看数据并将特征分为数值特征和分类特征

# Inspect the data structure
housing = load_housing_data()
housing.head() # show the first five rows
# Split the features: the categorical column (kept as a one-column DataFrame) ...
housing_cat = housing[['ocean_proximity']] 
# ... and every column with a numeric dtype.
housing_num = housing.select_dtypes(include=[np.number])

在这里插入图片描述

设计转换器

转换器功能:添加新的特征rooms_per_household、population_per_household,以及可选的bedrooms_per_room


from sklearn.base import BaseEstimator, TransformerMixin  

# Look the column positions up by name: safer than hard-coding indices 3, 4, 5, 6.
# These are the columns the derived ratio features are computed from.
rooms_ix, bedrooms_ix, population_ix, household_ix = map(
    list(housing.columns).index,
    ("total_rooms", "total_bedrooms", "population", "households"),
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends derived ratio columns to the input array.

    The source columns are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Args:
        add_bedrooms_per_room: When true, ``transform`` also appends the
            bedrooms-per-room ratio as the last column.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless: there is nothing to learn, so just return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D tuple
        indexing (e.g. a NumPy array). Appended columns, in order:
        rooms per household, population per household and, if enabled,
        bedrooms per room.
        """
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks X and each 1-D ratio side by side as columns.
        return np.c_[(X, *extra_columns)]

# Build the transformer with the bedrooms-per-room column switched off ...
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# ... and apply it to the raw values of the full housing frame.
housing_extra_attribs = attr_adder.transform(housing.values)

后面pipeline调用的是FunctionTransformer,但其中的函数仍然需要前面那段程序计算出的列索引(rooms_ix、bedrooms_ix、population_ix、household_ix)

from sklearn.preprocessing import FunctionTransformer

def add_extra_features(X, add_bedrooms_per_room=True):
    """Append derived ratio features to ``X`` as extra trailing columns.

    Column positions are read from the module-level indices ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Args:
        X: 2-D array indexed as ``X[:, i]`` (e.g. a NumPy array).
        add_bedrooms_per_room: Flag; when true the bedrooms-per-room ratio
            is appended as the last column.

    Returns:
        ``X`` followed by rooms per household, population per household
        and, optionally, bedrooms per room.
    """
    rooms = X[:, rooms_ix]
    households = X[:, household_ix]
    derived = [
        rooms / households,                  # rooms per household
        X[:, population_ix] / households,    # population per household
    ]
    if add_bedrooms_per_room:
        derived.append(X[:, bedrooms_ix] / rooms)  # share of rooms that are bedrooms
    # np.c_ stacks X and each 1-D ratio side by side as columns.
    return np.c_[(X, *derived)]
    
# Wrap the plain function as a transformer so it can be used as a pipeline step.
attr_adder = FunctionTransformer(add_extra_features,      # func: the custom function that adds the extra features
                        validate=False,          # validate: bool, default=False; input validation is off
                        kw_args={"add_bedrooms_per_room": False}) # kw_args: dict, default=None; extra keyword arguments passed to func

使用pipeline处理数值特征

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler   #估算器

num_pipeline = Pipeline([                                   # transformer chain for the numeric features
        ('imputer', SimpleImputer(strategy="median")),      # replace missing values with the median
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),  # append the derived features
        ('std_scaler', StandardScaler()),    # standardize: subtract the mean, then divide by the standard deviation
    ])

# Fit the pipeline on the numeric columns and transform them in one call.
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr

在这里插入图片描述

使用列转换器ColumnTransformer处理所有特征

from sklearn.compose import ColumnTransformer      #列转换器
from sklearn.preprocessing import OneHotEncoder #OneHot编码器
# Column names routed to each sub-transformer.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),          # numeric columns go through the numeric pipeline
        ("cat", OneHotEncoder(), cat_attribs),       # the categorical column ("ocean_proximity") is one-hot encoded into indicator columns
    ])

# Fit and apply the column transformer to the full housing frame.
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

在这里插入图片描述

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值