分类数据填充、数值数据填充、无序分类数据变成dummy类型、有序分类数据变成label类型、数值类型切分(分箱)。要将这些动作结合在一起,应该怎么做?
将这些动作都定义为一个含有fit和transform的类,然后使用Pipeline集成起来,因为Pipeline需要fit和transform方法。
基本思想都是这样的:每个类继承TransformerMixin,构造函数传入列名以及其他参数;transform方法返回一个改造后的新DataFrame。
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # Imputer was removed in scikit-learn 0.22; SimpleImputer is its
    # replacement and accepts the same ``strategy`` argument.
    from sklearn.impute import SimpleImputer as Imputer
# Sample frame with one nominal, one binary, one ordinal and one numeric
# column; the None entries are the missing values to be imputed.
X = pd.DataFrame(
    data={
        'city': [
            'tokyo', None, 'london', 'seattle', 'san francisco', 'tokyo',
        ],
        'boolean': [
            'yes', 'no', None, 'no', 'no', 'yes',
        ],
        'ordinal_column': [
            'somewhat like', 'like', 'somewhat like',
            'like', 'somewhat like', 'dislike',
        ],
        'quantitative_column': [
            1, 11, -0.5, 10, None, 20,
        ],
    }
)
class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with the most frequent value.

    The fill value is computed from the frame passed to ``transform``;
    ``fit`` learns nothing and only returns ``self``.

    Args:
        cols: Names of the columns to impute. ``None`` imputes nothing.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def transform(self, df):
        """Return a copy of ``df`` with the configured columns imputed.

        Args:
            df: Input DataFrame; it is not modified.

        Returns:
            A new DataFrame in which missing entries of each configured
            column are replaced by that column's most frequent value.
        """
        X = df.copy()
        cols = self.cols if self.cols is not None else ()
        for col in cols:
            counts = X[col].value_counts()
            if counts.empty:
                # Every value is missing, so there is no mode to fill with.
                continue
            # Assign the result back rather than calling fillna with
            # ``inplace=True`` on the column selection: that chained in-place
            # call acts on a temporary under pandas Copy-on-Write and would
            # leave X unchanged.
            X[col] = X[col].fillna(counts.index[0])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; arguments are ignored."""
        return self


cci = CustomCategoryImputer(cols=['city', 'boolean'])
class CustomQuantitativeImputer(TransformerMixin):
    """Fill missing values in numeric columns using an ``Imputer`` strategy.

    The imputer is fitted on the frame passed to ``transform``; ``fit``
    learns nothing and only returns ``self``.

    Args:
        cols: Names of the columns to impute. ``None`` imputes nothing.
        strategy: Strategy name forwarded to ``Imputer`` (default ``'mean'``).
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def transform(self, df):
        """Return a copy of ``df`` with the configured columns imputed.

        Args:
            df: Input DataFrame; it is not modified.

        Returns:
            A new DataFrame in which each configured column has been
            replaced by the imputer's output for that column.
        """
        X = df.copy()
        if self.cols is None:
            return X
        impute = Imputer(strategy=self.strategy)
        for col in self.cols:
            if not X[col].notna().any():
                # No observed value to derive a fill statistic from; leave
                # the column untouched instead of failing inside the imputer.
                continue
            # Double brackets keep the selection two-dimensional, which is
            # the input shape the imputer is given here.
            X[col] = impute.fit_transform(X[[col]])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; arguments are ignored."""
        return self


cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')
# Combine cci and cqi into a single imputer pipeline.
imputer = Pipeline([('cci', cci), ('cqi', cqi)])
class CustomDummifier(TransformerMixin):
    """Expand the selected columns into indicator columns via ``pd.get_dummies``.

    Args:
        cols: Column names forwarded as the ``columns`` argument of
            ``pd.get_dummies``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, *_):
        """Do nothing and return ``self``; arguments are ignored."""
        return self

    def transform(self, X):
        """Return the result of ``pd.get_dummies`` applied to ``X``."""
        target_columns = self.cols
        dummified = pd.get_dummies(X, columns=target_columns)
        return dummified


cd = CustomDummifier(cols=['boolean', 'city'])
class CustomEncoder(TransformerMixin):
    """Replace the labels of an ordinal column with their rank in ``ordering``.

    Args:
        col: Name of the column to encode.
        ordering: Sequence of labels from lowest to highest rank. Each label
            is encoded as its position in this sequence.
    """

    def __init__(self, col, ordering=None):
        self.col = col
        self.ordering = ordering

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` encoded as integer ranks.

        Args:
            df: Input DataFrame; it is not modified.

        Returns:
            A new DataFrame whose ``col`` holds each label's position in
            ``ordering``.

        Raises:
            ValueError: If ``ordering`` was not provided, or if the column
                contains a value (including a missing one) that is not in
                ``ordering``.
        """
        if self.ordering is None:
            raise ValueError(
                "CustomEncoder needs an `ordering` of labels to encode "
                "column {!r}".format(self.col)
            )
        # Build the label -> rank lookup once instead of scanning the list
        # for every row. setdefault keeps the first position of a duplicated
        # label, which is what list.index() would return.
        rank = {}
        for position, label in enumerate(self.ordering):
            rank.setdefault(label, position)

        encoded = df.copy()
        codes = encoded[self.col].map(rank)
        unmapped = codes.isna()
        if unmapped.any():
            # Fail with ValueError on unknown labels, as list.index() does,
            # rather than letting them pass through as NaN.
            unknown = encoded.loc[unmapped, self.col].unique().tolist()
            raise ValueError(
                "{!r} not in ordering {!r}".format(unknown, list(self.ordering))
            )
        encoded[self.col] = codes.astype('int64')
        return encoded

    def fit(self, *_):
        """Do nothing and return ``self``; arguments are ignored."""
        return self


ce = CustomEncoder(
    col='ordinal_column',
    ordering=['dislike', 'somewhat like', 'like'],
)
class CustomCutter(TransformerMixin):
    """Bin a numeric column with ``pd.cut``.

    Args:
        col: Name of the column to bin.
        bins: Forwarded as the ``bins`` argument of ``pd.cut``.
        labels: Forwarded as the ``labels`` argument of ``pd.cut``.
    """

    def __init__(self, col, bins=3, labels=False):
        self.col, self.bins, self.labels = col, bins, labels

    def fit(self, *_):
        """Do nothing and return ``self``; arguments are ignored."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` whose ``col`` is replaced by its binned form."""
        binned = df.copy()
        source = binned[self.col]
        binned[self.col] = pd.cut(source, bins=self.bins, labels=self.labels)
        return binned


cc = CustomCutter(col='quantitative_column', bins=3)
# Full preprocessing chain: impute, dummify, ordinal-encode, then bin.
pipe = Pipeline(steps=[
    ("imputer", imputer),
    ('dummify', cd),
    ('encode', ce),
    ('cut', cc),
])

# Show the raw frame first, then the frame after the whole pipeline has run.
print(X)
print(pipe.fit(X).transform(X))