1. Discrete Variable Encoding
import pandas as pd
data = pd.read_csv('./data_preprocessing.csv', index_col=0)
data_ = data.copy()
1.1 For labels: sklearn.preprocessing.LabelEncoder
1.1.1 Encoding a single column
from sklearn.preprocessing import LabelEncoder

y = data_.iloc[:, -1]          # the label column (Survived)
le = LabelEncoder()
label = le.fit_transform(y)
data_.iloc[:, -1] = label
data_
    Age     Sex Embarked  Survived
0  22.0    male        S         0
1  38.0  female        C         2
2  26.0  female        S         2
3   NaN  female        S         2
4  35.0    male        S         0
5   NaN    male        Q         1
6  58.0    male        Q         0
7  20.0  female        C         2
8   2.0  female        S         1
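The fitted encoder stores the original label values, so the mapping can be inspected or reversed. A minimal sketch, continuing from the fitted le above:

le.classes_                   # original label values; the position of each value is its integer code
le.inverse_transform(label)   # recover the original labels from the integer codes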
1.1.2 Encoding multiple label columns at once (wrapper class)
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        '''
        If self.columns is given, encode only those columns of X;
        otherwise iterate over every column of the DataFrame and encode each one.
        Note that a fresh LabelEncoder is fitted per column on every call,
        so train and test are encoded independently of each other.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.items():   # .iteritems() was removed in pandas 2.0
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
# usage sketch: columns_with_strings_as_values, train and test are placeholders
label_encoder = MultiColumnLabelEncoder(columns=columns_with_strings_as_values)
train = label_encoder.fit_transform(train)
test = label_encoder.transform(test)
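A self-contained toy example of the wrapper (column names here are made up for illustration):

toy = pd.DataFrame({'color': ['red', 'blue', 'red'],
                    'size':  ['S', 'M', 'S'],
                    'price': [10, 20, 15]})
# encode only the string columns; the numeric 'price' column is left untouched
toy_encoded = MultiColumnLabelEncoder(columns=['color', 'size']).fit_transform(toy)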
1.2 For features (input must be 2-D, not a 1-D array): sklearn.preprocessing.OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

data_.iloc[:, 1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:, 1:-1])
data_
    Age  Sex  Embarked  Survived
0  22.0  1.0       2.0         0
1  38.0  0.0       0.0         2
2  26.0  0.0       2.0         2
3   NaN  0.0       2.0         2
4  35.0  1.0       2.0         0
5   NaN  1.0       1.0         1
6  58.0  1.0       1.0         0
7  20.0  0.0       0.0         2
8   2.0  0.0       2.0         1
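As with LabelEncoder, a fitted OrdinalEncoder exposes its mapping and can reverse it. A minimal sketch on the original (still-string) columns:

oe = OrdinalEncoder()
encoded = oe.fit_transform(data.iloc[:, 1:-1])   # the Sex and Embarked columns
oe.categories_                                   # one category array per column; position = code
oe.inverse_transform(encoded)                    # recover the original string values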
1.3 One-hot encoding (for nominal variables): sklearn.preprocessing.OneHotEncoder
1.3.1 Principle & process
'''
Nominal variables: no ordering, values cannot be computed with each other. Use OneHotEncoder.
    e.g. port of embarkation (S, C, Q), sex.
Ordinal variables: ordered, but values cannot be computed with each other. Use OrdinalEncoder.
    e.g. education level (primary school, high school, university).
Interval variables: ordered and arithmetic makes sense. e.g. weight (2 kg, 4 kg, 6 kg).

Simply mapping a nominal variable to (1, 2, 3) injects an artificial numeric order
that can mislead the model. OneHotEncoder turns nominal variables into dummy variables:

    'S'   [0,            'S'   [[1, 0, 0],
    'Q'    1,    >>>     'Q'    [0, 1, 0],
    'C'    2]            'C'    [0, 0, 1]]
'''
'''
For a binary categorical variable, once one dummy column is known the other is fully
determined, so one of them is redundant:

    ID  Gender            ID  Gender_F  Gender_M
    1   F                 1   1         0
    2   M        >>>      2   0         1
    3   M                 3   0         1
    4   F                 4   1         0

OneHotEncoder(drop='if_binary') keeps a single 0/1 column for binary variables and
only expands multi-class variables:

    ID  Gender  Income            ID  Gender  Income_High  Income_Medium  Income_Low
    1   F       High              1   0       1            0              0
    2   M       Medium    >>>     2   1       0            1              0
    3   M       High              3   1       1            0              0
    4   F       Low               4   0       0            0              1
'''
X = pd.DataFrame({'Gender': ['F', 'M', 'M', 'F'],
                  'Income': ['High', 'Medium', 'High', 'Low']})
X
  Gender  Income
0      F    High
1      M  Medium
2      M    High
3      F     Low
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(drop='if_binary')
enc.fit_transform(X).toarray()
'''
array([[0., 1., 0., 0.],
[1., 0., 0., 1.],
[1., 1., 0., 0.],
[0., 0., 1., 0.]])
'''
'''
Binary column:      F >>> 0, M >>> 1 (kept as a single column).
Multi-class column: first column = High, second = Low, third = Medium
(categories are sorted alphabetically).
'''
enc.categories_
'''
[array(['F', 'M'], dtype=object),
array(['High', 'Low', 'Medium'], dtype=object)]
'''
cate_cols = X.columns.tolist()
cate_cols
'''
['Gender', 'Income']
'''
cate_cols_new = []
for idx, colname in enumerate(cate_cols):
    if len(enc.categories_[idx]) == 2:
        # binary column: drop='if_binary' keeps a single column under the original name
        cate_cols_new.append(colname)
    else:
        # multi-class column: one new column per category
        for f in enc.categories_[idx]:
            feature_name = colname + '_' + f
            cate_cols_new.append(feature_name)
cate_cols_new
'''
['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''
pd.DataFrame(enc.fit_transform(X).toarray(), columns=cate_cols_new)
   Gender  Income_High  Income_Low  Income_Medium
0     0.0          1.0         0.0            0.0
1     1.0          0.0         0.0            1.0
2     1.0          1.0         0.0            0.0
3     0.0          0.0         1.0            0.0
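For quick exploratory work, pandas offers pd.get_dummies as a one-line alternative (note that it refits on whatever data it sees, so it does not guarantee consistent columns between train and test the way a fitted OneHotEncoder does). A minimal sketch on the same X:

pd.get_dummies(X, columns=['Gender', 'Income'])
# drop_first=True would drop the first level of every column (not only binary ones),
# so it is not an exact equivalent of drop='if_binary'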
1.3.2 Wrapping the column-naming logic into a function
def cate_colName(Transformer, category_cols, drop='if_binary'):
    """
    Build the output column names for one-hot-encoded categorical fields.
    :param Transformer: fitted OneHotEncoder
    :param category_cols: original column names
    :param drop: the encoder's drop parameter
    """
    cate_cols_new = []
    col_value = Transformer.categories_
    for idx, colname in enumerate(category_cols):
        if (len(col_value[idx]) == 2) & (drop == 'if_binary'):
            cate_cols_new.append(colname)
        else:
            for f in col_value[idx]:
                feature_name = colname + '_' + f
                cate_cols_new.append(feature_name)
    return cate_cols_new
cate_colName(enc, cate_cols)
'''
['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''
1.3.3 Encoding multi-column data
from sklearn.preprocessing import OneHotEncoder

X = data.iloc[:, 1:-1]                          # the Sex and Embarked columns
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
result
'''
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.],
[1., 0., 0., 0., 1.],
[0., 1., 0., 0., 1.],
[0., 1., 0., 1., 0.],
[0., 1., 0., 1., 0.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
'''
newdata = pd.concat([data, pd.DataFrame(result)], axis=1)
newdata.drop(['Sex', 'Embarked'], axis=1, inplace=True)
newdata
    Age Survived    0    1    2    3    4
0  22.0       No  0.0  1.0  0.0  0.0  1.0
1  38.0      Yes  1.0  0.0  1.0  0.0  0.0
2  26.0      Yes  1.0  0.0  0.0  0.0  1.0
3   NaN      Yes  1.0  0.0  0.0  0.0  1.0
4  35.0       No  0.0  1.0  0.0  0.0  1.0
5   NaN  Unknown  0.0  1.0  0.0  1.0  0.0
6  58.0       No  0.0  1.0  0.0  1.0  0.0
7  20.0      Yes  1.0  0.0  1.0  0.0  0.0
8   2.0  Unknown  1.0  0.0  0.0  0.0  1.0
print(enc.get_feature_names())   # deprecated since sklearn 1.0; newer versions use get_feature_names_out()
'''
['x0_female' 'x0_male' 'x1_C' 'x1_Q' 'x1_S']
'''
newdata.columns = ['Age', 'Survived', 'female', 'male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
newdata
    Age Survived  female  male  Embarked_C  Embarked_Q  Embarked_S
0  22.0       No     0.0   1.0         0.0         0.0         1.0
1  38.0      Yes     1.0   0.0         1.0         0.0         0.0
2  26.0      Yes     1.0   0.0         0.0         0.0         1.0
3   NaN      Yes     1.0   0.0         0.0         0.0         1.0
4  35.0       No     0.0   1.0         0.0         0.0         1.0
5   NaN  Unknown     0.0   1.0         0.0         1.0         0.0
6  58.0       No     0.0   1.0         0.0         1.0         0.0
7  20.0      Yes     1.0   0.0         1.0         0.0         0.0
8   2.0  Unknown     1.0   0.0         0.0         0.0         1.0
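In scikit-learn >= 1.0 the encoder can generate these names itself via get_feature_names_out (and in >= 1.2 the old get_feature_names is removed), which makes the manual renaming above unnecessary. A minimal sketch reusing enc and result from above:

feature_names = enc.get_feature_names_out(['Sex', 'Embarked'])
# expected: ['Sex_female' 'Sex_male' 'Embarked_C' 'Embarked_Q' 'Embarked_S']
newdata = pd.concat([data.drop(['Sex', 'Embarked'], axis=1),
                     pd.DataFrame(result, columns=feature_names, index=data.index)],
                    axis=1)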
1.3.4 Encoding 1-D data: .reshape(-1, 1)
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder().fit_transform(Y_train.reshape(-1, 1))   # the 1-D label array must be reshaped to 2-D
onehot = onehot.toarray()
print('onehot', onehot)
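If a dense array is wanted directly, the encoder can be asked for dense output instead of calling .toarray() afterwards; the parameter is sparse_output in scikit-learn >= 1.2 (older versions use sparse=False). A minimal sketch, assuming Y_train is a 1-D label array:

onehot = OneHotEncoder(sparse_output=False).fit_transform(Y_train.reshape(-1, 1))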
1.3.5 Encoding with Keras: keras.utils.np_utils.to_categorical(data)
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

Y                                               # the original (string) label array
encoder = LabelEncoder()
Y_labelencoded = encoder.fit_transform(Y)       # strings -> integer codes
Y_onehot = np_utils.to_categorical(Y_labelencoded)   # integer codes -> one-hot matrix
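In recent TensorFlow/Keras releases np_utils has been removed and the same helper lives at tf.keras.utils.to_categorical. A self-contained sketch with a toy label array:

import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical   # replaces keras.utils.np_utils

y = np.array(['cat', 'dog', 'cat', 'bird'])
y_int = LabelEncoder().fit_transform(y)   # [1, 2, 1, 0], categories sorted alphabetically
y_onehot = to_categorical(y_int)          # shape (4, 3), one column per class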
1.4 Converting a string feature to int: (df['f1'] == 'Yes').astype(int)
'''
== 'Yes' returns a boolean Series of True/False values;
.astype(int) converts them to 1/0.
'''
original_data['Attrition'] = (original_data['Attrition'] == 'Yes').astype(int)
2. Continuous Variable Binning (Encoding Continuous Variables)
2.1 Principle
'''
Binning turns a continuous field into a discrete one:
  - reduces the influence of outliers and removes the effect of the feature's scale;
  - for linear models it introduces non-linearity and can improve performance;
  - for tree models it throws away information in the continuous values and can hurt performance.

Example with bins [0, 30) -> 0, [30, 60) -> 1, [60, inf) -> 2:

    ID  Income          ID  Income_Level
    1   0               1   0
    2   10              2   0
    3   180     >>>     3   2
    4   30              4   1
    5   55              5   1
'''
'''
Equal-width binning      strategy='uniform'   still somewhat affected by outliers
Equal-frequency binning  strategy='quantile'  completely ignores outliers
K-means binning          strategy='kmeans'    respects the original value distribution; try it first
'''
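Equal-width and equal-frequency binning are also available directly in pandas as pd.cut and pd.qcut, which is convenient when working on a DataFrame column. A minimal sketch with a toy income column:

s = pd.Series([0, 10, 180, 30, 55, 35, 25, 75, 80, 10])
pd.cut(s, bins=3, labels=False)    # equal-width bins, returns integer codes
pd.qcut(s, q=3, labels=False)      # equal-frequency (quantile) bins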
2.2 Equal-width binning: KBinsDiscretizer(strategy='uniform')
import numpy as np

income = np.array([0, 10, 180, 30, 55, 35, 25, 75, 80, 10]).reshape(-1, 1)
income
'''
array([[ 0],
[ 10],
[180],
[ 30],
[ 55],
[ 35],
[ 25],
[ 75],
[ 80],
[ 10]])
'''
from sklearn.preprocessing import KBinsDiscretizer

'''
KBinsDiscretizer parameters:
    n_bins    number of bins
    strategy  binning strategy
        'uniform'   equal-width binning
        'quantile'  equal-frequency binning
        'kmeans'    k-means binning
    encode    how the binned field is encoded
        'ordinal'       integer codes (a single column)
        'onehot'        one-hot encoding (sparse matrix)
        'onehot-dense'  one-hot encoding (dense array)
'''
dis = KBinsDiscretizer(n_bins=3, strategy='uniform', encode='ordinal')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.]])
'''
dis.bin_edges_
'''
array([array([ 0., 60., 120., 180.])], dtype=object)
'''
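With strategy='uniform' the edges simply split the observed range [0, 180] into three equal-width intervals of 60: [0, 60), [60, 120) and [120, 180].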
2.3 Equal-frequency binning: KBinsDiscretizer(strategy='quantile')
'''
Splits the values into bins containing (roughly) equal numbers of samples.
If the sample count is not divisible by the number of bins, the remainder ends up
in one of the bins (10 samples / 3 bins -> 3/3/4 here).
'''
np.sort(income.flatten(), axis=0)
'''
array([ 0, 10, 10, 25, 30, 35, 55, 75, 80, 180])
'''
dis = KBinsDiscretizer(n_bins=3, strategy='quantile', encode='ordinal')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[1.],
[1.],
[1.],
[0.],
[2.],
[2.],
[0.]])
'''
dis.bin_edges_
'''
array([array([ 0., 25., 55., 180.])], dtype=object)
'''
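Here the edges 25 and 55 are (approximately) the 1/3 and 2/3 quantiles of the sorted values, so the bins hold roughly equal numbers of samples: 3, 3 and 4 in this example.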
2.4 K-means binning: KBinsDiscretizer(strategy='kmeans')
from sklearn import cluster

kmeans = cluster.KMeans(n_clusters=3)
kmeans.fit(income)
kmeans.labels_
'''
array([0, 0, 1, 0, 2, 0, 0, 2, 2, 0], dtype=int32)
'''
dis = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.]])
'''
dis.bin_edges_
'''
array([array([ 0. , 44.16666667, 125. , 180. ])],
dtype=object)
'''
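With strategy='kmeans' the values are clustered in one dimension and the bin edges are placed midway between adjacent cluster centers, so the bins follow the original value distribution.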
2.5 Binarizing continuous variables
2.5.1 sklearn.preprocessing.Binarizer
data_2 = data.copy()
data_2.loc[:, 'Age'] = data_2.loc[:, 'Age'].fillna(data_2.loc[:, 'Age'].mean())   # fill missing ages with the mean
X = data_2.iloc[:, 0].values.reshape(-1, 1)
X
'''
array([[22. ],
[38. ],
[26. ],
[28.71428571],
[35. ],
[28.71428571],
[58. ],
[20. ],
[ 2. ]])
'''
from sklearn.preprocessing import Binarizer

transformer = Binarizer(threshold=25).fit_transform(X)   # 1 if the value is above 25, else 0
transformer
'''
array([[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.]])
'''
2.5.2 Simple DataFrame operation: (df['f1'] > threshold).astype(int)
student['age'] = (student['age'] > 12).astype(int)
2.6 Regression Class Cutoff (finding the best thresholds for mapping continuous predictions to integer classes)
[notebook] xgboost, cudf, Regression Class Cutoff (best thresholds for mapping continuous values to class integers)
import numpy as np
import scipy as sp
import scipy.optimize                          # registers sp.optimize
from sklearn.metrics import cohen_kappa_score
from functools import partial

'''
Regression predictions are continuous; to turn them into a fixed set of classes
we need cutoff values, and this class searches for the best cutoffs.
'''
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        # Map each continuous prediction to a class (3..8) using the current cutoffs
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif coef[0] <= pred < coef[1]:
                X_p[i] = 4
            elif coef[1] <= pred < coef[2]:
                X_p[i] = 5
            elif coef[2] <= pred < coef[3]:
                X_p[i] = 6
            elif coef[3] <= pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        # Quadratic weighted kappa; return its negative so that minimizing
        # the loss maximizes the kappa score
        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        # Fix X and y in _kappa_loss, start from initial_coef,
        # and search for the optimal cutoffs with Nelder-Mead
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        # Apply a given set of cutoffs to continuous predictions
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif coef[0] <= pred < coef[1]:
                X_p[i] = 4
            elif coef[1] <= pred < coef[2]:
                X_p[i] = 5
            elif coef[2] <= pred < coef[3]:
                X_p[i] = 6
            elif coef[3] <= pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p.astype('int')

    def coefficients(self):
        # the OptimizeResult returned by minimize is dict-like; 'x' holds the cutoffs
        return self.coef_['x']
The code below is one fold of a 10-fold cross-validation loop:
optR = OptimizedRounder()
optR.fit(xgb_valid_preds, val_target)                            # fit cutoffs on the validation predictions
temp_oof = optR.predict(xgb_valid_preds, optR.coefficients())    # out-of-fold predictions as integer classes
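Once fitted on the validation predictions, the same thresholds would typically also be applied to the test-set predictions; the variable name below is illustrative:

test_preds_rounded = optR.predict(xgb_test_preds, optR.coefficients())   # xgb_test_preds: hypothetical test predictions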