第四章 pipeline集成一系列数据处理动作

分类数据填充、数值数据填充、无序分类数据转换为 dummy(哑变量)类型、有序分类数据转换为 label(整数编码)类型、数值类型切分(分箱)。要将这些动作结合在一起,应该怎么做?

将这些动作都定义为一个含有fit和transform的类,然后使用Pipeline集成起来,因为Pipeline需要fit和transform方法。

基本思想都是这样的:每个类都继承 TransformerMixin,构造函数传入列名以及其他一些参数;fit 方法直接返回 self,transform 方法返回一个改造后的新的 DataFrame。

import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

try:
    from sklearn.impute import SimpleImputer
    # ``sklearn.preprocessing.Imputer`` was deprecated in scikit-learn 0.20 and
    # removed in 0.22; keep the legacy name bound for code that still uses it.
    Imputer = SimpleImputer
except ImportError:  # scikit-learn < 0.20 has no ``sklearn.impute`` module
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer

# Toy data set for the pipeline demo: two nominal columns ('city', 'boolean'),
# one ordinal column and one numeric column. 'city', 'boolean' and
# 'quantitative_column' each contain exactly one missing value.
X = pd.DataFrame(dict(
    city=['tokyo', None, 'london', 'seattle', 'san francisco', 'tokyo'],
    boolean=['yes', 'no', None, 'no', 'no', 'yes'],
    ordinal_column=[
        'somewhat like', 'like', 'somewhat like',
        'like', 'somewhat like', 'dislike',
    ],
    quantitative_column=[1, 11, -0.5, 10, None, 20],
))



class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with the column's mode.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute. ``None`` (the default) is treated as
        "no columns", so ``transform`` then returns an unchanged copy.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, *_):
        """Return ``self`` unchanged; nothing is learned here.

        The fill value is computed inside ``transform`` from the frame being
        transformed. Any arguments are accepted and ignored.
        """
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``cols`` filled.

        Each listed column is filled with its most frequent non-missing value
        (first entry of ``value_counts()``). The input frame is not modified.
        """
        X = df.copy()
        for col in (self.cols or []):
            counts = X[col].value_counts()
            if counts.empty:
                # Column has no non-missing value, so there is no mode to use.
                continue
            # Assign the result back instead of ``X[col].fillna(..., inplace=True)``:
            # the in-place call works on an intermediate object and does not
            # update ``X`` when pandas copy-on-write is active.
            X[col] = X[col].fillna(counts.index[0])
        return X


cci = CustomCategoryImputer(cols=['city', 'boolean'])

class CustomQuantitativeImputer(TransformerMixin):
    """Fill missing values in numeric columns using a scikit-learn imputer.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute. ``None`` (the default) is treated as
        "no columns", so ``transform`` then returns an unchanged copy.
    strategy : str, default 'mean'
        Passed through as the ``strategy`` argument of the imputer.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def fit(self, *_):
        """Return ``self`` unchanged; nothing is learned here.

        The imputer is fitted inside ``transform`` on the frame being
        transformed. Any arguments are accepted and ignored.
        """
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``cols`` imputed.

        Each listed column is imputed independently. The input frame is not
        modified.
        """
        X = df.copy()
        # ``SimpleImputer`` replaces ``sklearn.preprocessing.Imputer``, which is
        # no longer available in current scikit-learn releases.
        impute = SimpleImputer(strategy=self.strategy)
        for col in (self.cols or []):
            if not X[col].notna().any():
                # No observed value to derive a statistic from; leave as is.
                continue
            # fit_transform returns an (n_rows, 1) array; flatten it so the
            # assignment is an unambiguous single-column write.
            X[col] = impute.fit_transform(X[[col]]).ravel()
        return X


cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')

# Chain the categorical imputer (cci) and the quantitative imputer (cqi)
# into a single imputation step.
imputer = Pipeline(steps=[
    ('cci', cci),
    ('cqi', cqi),
])

class CustomDummifier(TransformerMixin):
    """One-hot encode columns by delegating to ``pandas.get_dummies``.

    Parameters
    ----------
    cols : list of str, optional
        Forwarded as the ``columns`` argument of ``pandas.get_dummies``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, *_):
        """Return ``self`` unchanged; arguments are accepted and ignored."""
        return self

    def transform(self, X):
        """Return the result of ``pandas.get_dummies`` applied to ``X``."""
        dummified = pd.get_dummies(X, columns=self.cols)
        return dummified


cd = CustomDummifier(cols=['boolean', 'city'])

class CustomEncoder(TransformerMixin):
    """Encode an ordinal column as integer positions in a given ordering.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    ordering : list, optional
        Category values listed from lowest to highest; each value in ``col``
        is replaced by its index in this list.
    """

    def __init__(self, col, ordering = None):
        self.col = col
        self.ordering = ordering

    def fit(self, *_):
        """Return ``self`` unchanged; arguments are accepted and ignored."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` replaced by ordering indices."""

        def position(label):
            # Looked up per element, exactly like the ordering list's own
            # ``index`` method: raises ValueError for a label not in the list.
            return self.ordering.index(label)

        encoded = df.copy()
        encoded[self.col] = encoded[self.col].map(position)
        return encoded


ce = CustomEncoder(
    col='ordinal_column',
    ordering=['dislike', 'somewhat like', 'like'],
)



class CustomCutter(TransformerMixin):
    """Bin a numeric column by delegating to ``pandas.cut``.

    Parameters
    ----------
    col : str
        Name of the column to bin.
    bins : default 3
        Forwarded as the ``bins`` argument of ``pandas.cut``.
    labels : default False
        Forwarded as the ``labels`` argument of ``pandas.cut``.
    """

    def __init__(self, col, bins=3, labels=False):
        self.col = col
        self.bins = bins
        self.labels = labels

    def fit(self, *_):
        """Return ``self`` unchanged; arguments are accepted and ignored."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` replaced by its binned values."""
        binned = df.copy()
        source = binned[self.col]
        binned[self.col] = pd.cut(source, bins=self.bins, labels=self.labels)
        return binned


cc = CustomCutter(col='quantitative_column', bins=3)
# Full preprocessing pipeline, applied in order:
# impute -> dummify nominal columns -> encode the ordinal column -> bin the numeric column.
pipe = Pipeline(steps=[
    ("imputer", imputer),
    ('dummify', cd),
    ('encode', ce),
    ('cut', cc),
])
print(X)  # raw data, before any transformation
pipe.fit(X)
print(pipe.transform(X))  # data after every pipeline step has run

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值