特征工程3

第四章:特征构建:生成新的特征

import pandas as pd
from sklearn.base import TransformerMixin
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
X = pd.DataFrame({'city':['tokyo', None, 'london', 'seattle', 'san francisco', 
                  'tokyo'], 
                  'boolean':['yes', 'no', None, 'no', 'no', 'yes'], 
                  'ordinal_column':['somewhat like', 'like', 'somewhat like','like','somewhat like', 'dislike'], 
                  'quantitative_column':[1, 11, -.5, 10, None, 20]}) 
print(X)
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  1.0
1           None      no           like                 11.0
2         london    None  somewhat like                 -0.5
3        seattle      no           like                 10.0
4  san francisco      no  somewhat like                  NaN
5          tokyo     yes        dislike                 20.0
X['ordinal_column'].hist()
<AxesSubplot:>


![ordinal_column 直方图](output_4_1.png)

X.isnull().sum()
city                   1
boolean                1
ordinal_column         0
quantitative_column    1
dtype: int64
X['city'].value_counts().index[0]   #获取某一列最常见的元素
'tokyo'

自定义分类填充器

from sklearn.base import TransformerMixin

class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in the selected columns with each column's mode.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute. ``None`` (the default) means no
        column is imputed and ``transform`` returns an unchanged copy.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``cols`` filled.

        The fill value is the most frequent non-missing value of the column,
        computed from ``df`` itself. The input frame is not modified.
        """
        X = df.copy()
        for col in self.cols or []:
            counts = X[col].value_counts()
            if counts.empty:
                # Column holds only missing values: there is no mode to use.
                continue
            # Assign the result back instead of calling fillna(inplace=True)
            # on the column selection; the chained in-place form does not
            # update X when pandas Copy-on-Write is active.
            X[col] = X[col].fillna(counts.index[0])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self
import pprint

pprint.pprint(X)
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  1.0
1           None      no           like                 11.0
2         london    None  somewhat like                 -0.5
3        seattle      no           like                 10.0
4  san francisco      no  somewhat like                  NaN
5          tokyo     yes        dislike                 20.0
cci = CustomCategoryImputer(cols = ['city','boolean'])
cci.fit_transform(X)
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  1.0
1          tokyo      no           like                 11.0
2         london      no  somewhat like                 -0.5
3        seattle      no           like                 10.0
4  san francisco      no  somewhat like                  NaN
5          tokyo     yes        dislike                 20.0

自定义定量填充器

from sklearn.impute import SimpleImputer

class CustomQuantitativeImputer(TransformerMixin):
    """Impute the selected columns one at a time with a ``SimpleImputer``.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute.
    strategy : str, optional
        Forwarded unchanged to ``SimpleImputer(strategy=...)``.
    """

    def __init__(self, cols=None, strategy=None):
        self.strategy = strategy
        self.cols = cols

    def fit(self, *_):
        """Do nothing and return ``self``; the imputer is fitted in ``transform``."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with each column in ``cols`` imputed.

        A single imputer is created per call and re-fitted on every column
        of the frame being transformed.
        """
        filled = df.copy()
        imputer = SimpleImputer(strategy=self.strategy)
        for name in self.cols:
            # Double brackets keep the selection two-dimensional.
            single_column = filled[[name]]
            filled[name] = imputer.fit_transform(single_column)
        return filled
                                            
cqi = CustomQuantitativeImputer(cols = ['quantitative_column'],strategy='median')
cqi.fit_transform(X)
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  1.0
1           None      no           like                 11.0
2         london    None  somewhat like                 -0.5
3        seattle      no           like                 10.0
4  san francisco      no  somewhat like                 10.0
5          tokyo     yes        dislike                 20.0
# 使用流水线的方式

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

imputer = Pipeline([('quant',cqi),('category',cci)])
imputer.fit_transform(X)
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  1.0
1          tokyo      no           like                 11.0
2         london      no  somewhat like                 -0.5
3        seattle      no           like                 10.0
4  san francisco      no  somewhat like                 10.0
5          tokyo     yes        dislike                 20.0
X
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  1.0
1           None      no           like                 11.0
2         london    None  somewhat like                 -0.5
3        seattle      no           like                 10.0
4  san francisco      no  somewhat like                  NaN
5          tokyo     yes        dislike                 20.0

编码分类变量

定类等级的编码

  • 主要方式是将分类数据转换为虚拟变量
# 自定义虚拟化器

class CustomDummifier(TransformerMixin):
    """Replace the selected categorical columns with dummy (one-hot) columns.

    Parameters
    ----------
    cols : list of str, optional
        Column names forwarded as ``columns=`` to ``pandas.get_dummies``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def transform(self, df):
        """Return ``pd.get_dummies`` applied to ``df`` for the configured columns.

        The frame passed in is the one encoded. Reading a module-level
        variable here instead would discard the output of any earlier
        pipeline step, such as imputation.
        """
        return pd.get_dummies(df, columns=self.cols)

    def fit(self, *_):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self
cd = CustomDummifier(cols = ['boolean','city'])
cd.fit_transform(X)
  ordinal_column  quantitative_column  boolean_no  boolean_yes  city_london  city_san francisco  city_seattle  city_tokyo
0  somewhat like                  1.0           0            1            0                   0             0           1
1           like                 11.0           1            0            0                   0             0           0
2  somewhat like                 -0.5           0            0            1                   0             0           0
3           like                 10.0           1            0            0                   0             1           0
4  somewhat like                  NaN           1            0            0                   1             0           0
5        dislike                 20.0           0            1            0                   0             0           1
### 定序等级的编码

class CustomerEncoder(TransformerMixin):
    """Encode one ordinal column as each value's position in ``ordering``.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    ordering : list, optional
        Category values listed from lowest to highest rank; the encoded
        value is the category's index in this list.
    """

    def __init__(self, col, ordering=None):
        self.ordering = ordering
        self.col = col

    def fit(self, *_):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` replaced by its rank indices."""
        encoded = df.copy()

        def rank_of(value):
            # list.index raises ValueError for a value not in the ordering.
            return self.ordering.index(value)

        encoded[self.col] = encoded[self.col].map(rank_of)
        return encoded
ce = CustomerEncoder(col='ordinal_column', ordering = ['dislike', 'somewhat like',
'like'])
ce.fit_transform(X)
            city boolean  ordinal_column  quantitative_column
0          tokyo     yes               1                  1.0
1           None      no               2                 11.0
2         london    None               1                 -0.5
3        seattle      no               2                 10.0
4  san francisco      no               1                  NaN
5          tokyo     yes               0                 20.0

连续特征分箱

pd.cut(X['quantitative_column'],bins =3)
0     (-0.52, 6.333]
1    (6.333, 13.167]
2     (-0.52, 6.333]
3    (6.333, 13.167]
4                NaN
5     (13.167, 20.0]
Name: quantitative_column, dtype: category
Categories (3, interval[float64]): [(-0.52, 6.333] < (6.333, 13.167] < (13.167, 20.0]]
pd.cut(X['quantitative_column'],bins = 3, labels = False)
0    0.0
1    1.0
2    0.0
3    1.0
4    NaN
5    2.0
Name: quantitative_column, dtype: float64
class CustomCutter(TransformerMixin):
    """Bin one numeric column with ``pandas.cut``.

    Parameters
    ----------
    col : str
        Name of the column to bin.
    bins
        Forwarded unchanged as ``bins=`` to ``pd.cut``.
    labels : default False
        Forwarded unchanged as ``labels=`` to ``pd.cut``.
    """

    def __init__(self, col, bins, labels=False):
        self.col = col
        self.bins = bins
        self.labels = labels

    def fit(self, *_):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` replaced by its binned values."""
        binned = df.copy()
        source = binned[self.col]
        binned[self.col] = pd.cut(source, bins=self.bins, labels=self.labels)
        return binned
cc = CustomCutter(col='quantitative_column', bins = 3)

cc.fit_transform(X) 
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  0.0
1           None      no           like                  1.0
2         london    None  somewhat like                  0.0
3        seattle      no           like                  1.0
4  san francisco      no  somewhat like                  NaN
5          tokyo     yes        dislike                  2.0

创建流水线

pipeline = Pipeline([('quant',cqi),('category',cci),('dummify',cd),('encode',ce),('cut',cc)])

print(X)
            city boolean ordinal_column  quantitative_column
0          tokyo     yes  somewhat like                  1.0
1           None      no           like                 11.0
2         london    None  somewhat like                 -0.5
3        seattle      no           like                 10.0
4  san francisco      no  somewhat like                  NaN
5          tokyo     yes        dislike                 20.0
pipeline.fit_transform(X)
   ordinal_column  quantitative_column  boolean_no  boolean_yes  city_london  city_san francisco  city_seattle  city_tokyo
0               1                  0.0           0            1            0                   0             0           1
1               2                  1.0           1            0            0                   0             0           0
2               1                  0.0           0            0            1                   0             0           0
3               2                  1.0           1            0            0                   0             1           0
4               1                  NaN           1            0            0                   1             0           0
5               0                  2.0           0            1            0                   0             0           1

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值