Table of Contents
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
% matplotlib inline
# Plot defaults: select the FangSong font (presumably so Chinese labels
# render) and turn off the Unicode minus glyph for axis tick labels.
plt.rcParams.update(
    {
        "font.sans-serif": ["FangSong"],
        "axes.unicode_minus": False,
    }
)
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")
import category_encoders as ce
非监督、不分裂(不增加列):OrdinalEncoder、CountEncoder;有监督、不分裂:TargetEncoder、LeaveOneOutEncoder、CatBoostEncoder、WOEEncoder;非监督、分裂(一列拆分为多列):OneHotEncoder、BinaryEncoder。
# List every public name exposed by the category_encoders package.
dir(ce)
['BackwardDifferenceEncoder',
'BaseNEncoder',
'BinaryEncoder',
'CatBoostEncoder',
'CountEncoder',
'GLMMEncoder',
'HashingEncoder',
'HelmertEncoder',
'JamesSteinEncoder',
'LeaveOneOutEncoder',
'MEstimateEncoder',
'OneHotEncoder',
'OrdinalEncoder',
'PolynomialEncoder',
'SumEncoder',
'TargetEncoder',
'WOEEncoder',
'__all__',
'__author__',
'__builtins__',
'__cached__',
'__doc__',
'__file__',
'__loader__',
'__name__',
'__package__',
'__path__',
'__spec__',
'__version__',
'backward_difference',
'basen',
'binary',
'cat_boost',
'count',
'glmm',
'hashing',
'helmert',
'james_stein',
'leave_one_out',
'm_estimate',
'one_hot',
'ordinal',
'polynomial',
'sum_coding',
'target_encoder',
'utils',
'woe']
# Toy sample: seven (Sex, Type) rows. Routing the rows through np.array
# coerces the integers to strings, so both columns hold string categories.
X = pd.DataFrame(
    data=np.array(
        [
            ("male", 10),
            ("female", 20),
            ("male", 10),
            ("female", 20),
            ("female", 10),
            ("female", 30),
            ("male", 10),
        ]
    ),
    columns=["Sex", "Type"],
)
# Binary target, one label per row of X.
y = np.asarray([0, 1, 1, 0, 1, 0, 1])
X
Sex Type 0 male 10 1 female 20 2 male 10 3 female 20 4 female 10 5 female 30 6 male 10
OrdinalEncoder 序列编码
相当于 sklearn 中的 LabelEncoder
但它对缺失值的编码效果不佳,应在编码之前先填充缺失值
encoder = ce. OrdinalEncoder( cols = [ 'Sex' , 'Type' ] ) . fit( X, y)
encoder
OrdinalEncoder(cols=['Sex', 'Type'],
mapping=[{'col': 'Sex', 'data_type': dtype('O'),
'mapping': male 1
female 2
NaN -2
dtype: int64},
{'col': 'Type', 'data_type': dtype('O'),
'mapping': 10 1
20 2
30 3
NaN -2
dtype: int64}])
# Apply the fitted ordinal mapping to the same frame it was fitted on.
encoder.transform(X)
Sex Type 0 1 1 1 2 2 2 1 1 3 2 2 4 2 1 5 2 3 6 1 1
OneHotEncoder 独热编码
相当于pandas中get_dummies
# One-hot encoding of both columns; drop_invariant=True asks the encoder to
# discard output columns that turn out to be constant.
encoder = ce.OneHotEncoder(cols=["Sex", "Type"], drop_invariant=True)
encoder.fit(X, y)
encoder.transform(X)
Sex_1 Sex_2 Type_1 Type_2 Type_3 0 1 0 1 0 0 1 0 1 0 1 0 2 1 0 1 0 0 3 0 1 0 1 0 4 0 1 1 0 0 5 0 1 0 0 1 6 1 0 1 0 0
TargetEncoder 目标编码
参考链接:https://zhuanlan.zhihu.com/p/119093636 (原文此处有一张示意图 image.png,因外链转存失败未能显示。)
encoder = ce. TargetEncoder( cols = [ 'Sex' , 'Type' ] , drop_invariant= True ) . fit( X, y)
encoder. transform( X)
Sex Type 0 0.655314 0.741531 1 0.503388 0.519210 2 0.655314 0.741531 3 0.503388 0.519210 4 0.503388 0.741531 5 0.503388 0.571429 6 0.655314 0.741531
BinaryEncoder 二进制编码
encoder = ce. BinaryEncoder( cols = [ 'Sex' , 'Type' ] ) . fit( X, y)
encoder. transform( X)
Sex_0 Sex_1 Type_0 Type_1 Type_2 0 0 1 0 0 1 1 1 0 0 1 0 2 0 1 0 0 1 3 1 0 0 1 0 4 1 0 0 0 1 5 1 0 0 1 1 6 0 1 0 0 1
BaseNEncoder N 进制编码(未指定 base 时结果与二进制编码相同)
# Base-N encoding with the library's default base: fit, then encode X.
encoder = ce.BaseNEncoder(cols=["Sex", "Type"])
encoder.fit(X, y)
encoder.transform(X)
Sex_0 Sex_1 Type_0 Type_1 Type_2 0 0 1 0 0 1 1 1 0 0 1 0 2 0 1 0 0 1 3 1 0 0 1 0 4 1 0 0 0 1 5 1 0 0 1 1 6 0 1 0 0 1
LeaveOneOutEncoder 留一法
类似目标编码
encoder = ce. LeaveOneOutEncoder( cols = [ 'Sex' , 'Type' ] ) . fit( X, y)
encoder. transform( X)
Sex Type 0 0.666667 0.750000 1 0.500000 0.500000 2 0.666667 0.750000 3 0.500000 0.500000 4 0.500000 0.750000 5 0.500000 0.571429 6 0.666667 0.750000
HashingEncoder 哈希编码
encoder = ce. HashingEncoder( cols = [ 'Sex' , 'Type' ] ) . fit( X, y)
encoder. transform( X)
col_0 col_1 col_2 col_3 col_4 col_5 col_6 col_7 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 2 1 0 0 0 0 1 0 0 3 0 0 0 0 1 1 0 0 4 1 0 0 0 0 1 0 0 5 0 0 0 0 0 1 0 1 6 1 0 0 0 0 1 0 0
CatBoostEncoder catboost目标编码
encoder = ce. CatBoostEncoder( cols = [ 'Sex' , 'Type' ] ) . fit( X, y)
encoder. transform( X)
Sex Type 0 0.642857 0.714286 1 0.514286 0.523810 2 0.642857 0.714286 3 0.514286 0.523810 4 0.514286 0.714286 5 0.514286 0.571429 6 0.642857 0.714286
CountEncoder 计数(频数)编码
encoder = ce. CountEncoder( cols = [ 'Sex' , 'Type' ] ) . fit( X, y)
encoder. transform( X)
Sex Type 0 3 4 1 4 2 2 3 4 3 4 2 4 4 4 5 4 1 6 3 4
WOEEncoder 证据权重编码
encoder = ce. WOEEncoder( cols = [ 'Sex' , 'Type' ] ) . fit( X, y)
encoder. transform( X)
Sex Type 0 0.223144 0.510826 1 -0.182322 -0.182322 2 0.223144 0.510826 3 -0.182322 -0.182322 4 -0.182322 0.510826 5 -0.182322 0.000000 6 0.223144 0.510826
# List the names exposed by category_encoders again for reference.
dir(ce)
['BackwardDifferenceEncoder',
'BaseNEncoder',
'BinaryEncoder',
'CatBoostEncoder',
'CountEncoder',
'GLMMEncoder',
'HashingEncoder',
'HelmertEncoder',
'JamesSteinEncoder',
'LeaveOneOutEncoder',
'MEstimateEncoder',
'OneHotEncoder',
'OrdinalEncoder',
'PolynomialEncoder',
'SumEncoder',
'TargetEncoder',
'WOEEncoder',
'__all__',
'__author__',
'__builtins__',
'__cached__',
'__doc__',
'__file__',
'__loader__',
'__name__',
'__package__',
'__path__',
'__spec__',
'__version__',
'backward_difference',
'basen',
'binary',
'cat_boost',
'count',
'glmm',
'hashing',
'helmert',
'james_stein',
'leave_one_out',
'm_estimate',
'one_hot',
'ordinal',
'polynomial',
'sum_coding',
'target_encoder',
'utils',
'woe']