import pandas as pd
from sklearn.base import TransformerMixin
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Toy dataset with one missing entry in each of 'city', 'boolean' and
# 'quantitative_column'; 'ordinal_column' is complete.
X = pd.DataFrame(
    {
        'city': ['tokyo', None, 'london', 'seattle', 'san francisco', 'tokyo'],
        'boolean': ['yes', 'no', None, 'no', 'no', 'yes'],
        'ordinal_column': [
            'somewhat like',
            'like',
            'somewhat like',
            'like',
            'somewhat like',
            'dislike',
        ],
        'quantitative_column': [1, 11, -0.5, 10, None, 20],
    }
)
print(X)
city boolean ordinal_column quantitative_column
0 tokyo yes somewhat like 1.0
1 None no like 11.0
2 london None somewhat like -0.5
3 seattle no like 10.0
4 san francisco no somewhat like NaN
5 tokyo yes dislike 20.0
# Plot a histogram of the values in the ordinal column.
X['ordinal_column'].hist()
<AxesSubplot:>
# Count the missing values in each column.
X.isnull().sum()
city 1
boolean 1
ordinal_column 0
quantitative_column 1
dtype: int64
X['city'].value_counts().index[0]  # get the most common element of a column
'tokyo'
# Custom category imputer (自定义填充器)
from sklearn.base import TransformerMixin
class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with the column's most frequent value.

    The imputer is stateless: ``fit`` learns nothing, and ``transform``
    derives each fill value from the frame it is handed.

    Args:
        cols: Names of the columns to impute. ``None`` (the default) means
            no column is touched.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def transform(self, df):
        """Return a copy of *df* with missing entries in ``self.cols`` filled.

        Each listed column is filled with its own most frequent non-missing
        value (the first index of ``value_counts()``). The input frame is
        not modified.

        Args:
            df: The DataFrame to impute.

        Returns:
            A new DataFrame with the listed columns imputed.

        Raises:
            KeyError: If a name in ``self.cols`` is not a column of *df*.
        """
        X = df.copy()
        for col in self.cols or []:
            counts = X[col].value_counts()
            if counts.empty:
                # No observed value at all, so there is nothing to fill with;
                # leave the column as it is instead of raising IndexError.
                continue
            # Assign the result back rather than calling
            # ``X[col].fillna(..., inplace=True)``: the chained in-place form
            # acts on a temporary under pandas Copy-on-Write and would leave
            # X unchanged.
            X[col] = X[col].fillna(counts.index[0])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; accepts and ignores any arguments."""
        return self
import pprint
# Pretty-print the frame X.
pprint.pprint(X)
city boolean ordinal_column quantitative_column
0 tokyo yes somewhat like 1.0
1 None no like 11.0
2 london None somewhat like -0.5
3 seattle no like 10.0
4 san francisco no somewhat like NaN
5 tokyo yes dislike 20.0
# Build a category imputer for the 'city' and 'boolean' columns and apply it to X.
cci = CustomCategoryImputer(cols =['city','boolean'])
cci.fit_transform(X)
city boolean ordinal_column quantitative_column
0 tokyo yes somewhat like 1.0
1 tokyo no like 11.0
2 london no somewhat like -0.5
3 seattle no like 10.0
4 san francisco no somewhat like NaN
5 tokyo yes dislike 20.0
# Custom quantitative imputer (自定义定量填充器)
from sklearn.impute import SimpleImputer
class CustomQuantitativeImputer(TransformerMixin):
    """Fill missing values in numeric columns using scikit-learn's ``SimpleImputer``.

    The imputer is stateless: ``fit`` learns nothing, and ``transform``
    fits a ``SimpleImputer`` on each listed column of the frame it is handed.

    Args:
        cols: Names of the columns to impute. ``None`` (the default) means
            no column is touched.
        strategy: Strategy name forwarded to ``SimpleImputer`` (for example
            ``'mean'`` or ``'median'``). ``None`` (the default) falls back to
            ``'mean'``, which is ``SimpleImputer``'s own default.
    """

    def __init__(self, cols=None, strategy=None):
        self.cols = cols
        self.strategy = strategy

    def transform(self, df):
        """Return a copy of *df* with missing entries in ``self.cols`` filled.

        Each listed column is imputed independently. The input frame is not
        modified.

        Args:
            df: The DataFrame to impute.

        Returns:
            A new DataFrame with the listed columns imputed.

        Raises:
            KeyError: If a name in ``self.cols`` is not a column of *df*.
        """
        X = df.copy()
        # SimpleImputer rejects strategy=None, so substitute its documented
        # default when the caller did not choose one.
        strategy = 'mean' if self.strategy is None else self.strategy
        impute = SimpleImputer(strategy=strategy)
        for col in self.cols or []:
            if not X[col].notna().any():
                # With no observed value SimpleImputer drops the feature and
                # returns zero columns, which cannot be assigned back; leave
                # the column as it is.
                continue
            X[col] = impute.fit_transform(X[[col]])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; accepts and ignores any arguments."""
        return self
city boolean ordinal_column quantitative_column
0 tokyo yes somewhat like 1.0
1 None no like 11.0
2 london None somewhat like -0.5
3 seattle no like 10.0
4 san francisco no somewhat like NaN
5 tokyo yes dislike 20.0