1. Discrete Variable Encoding
import pandas as pd
data = pd.read_csv('./data_preprocessing.csv', index_col=0)
data_ = data.copy()
1.1 For labels: sklearn.preprocessing.LabelEncoder
1.1.1 Encoding a single column
from sklearn.preprocessing import LabelEncoder

y = data_.iloc[:, -1]          # the label column (Survived)
le = LabelEncoder()
label = le.fit_transform(y)
data_.iloc[:, -1] = label
data_
    Age     Sex Embarked  Survived
0  22.0    male        S         0
1  38.0  female        C         2
2  26.0  female        S         2
3   NaN  female        S         2
4  35.0    male        S         0
5   NaN    male        Q         1
6  58.0    male        Q         0
7  20.0  female        C         2
8   2.0  female        S         1
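The fitted encoder stores the original label values, so the mapping can be inspected or reversed. A minimal sketch, continuing from the fitted le above:

le.classes_                   # original label values; the position of each value is its integer code
le.inverse_transform(label)   # recover the original labels from the integer codes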
1.1.2 Encoding multiple label columns at once (wrapper class)
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        '''
        If self.columns is given, encode only those columns of X;
        otherwise iterate over every column of the DataFrame and encode each one.
        Note that a fresh LabelEncoder is fitted per column on every call,
        so train and test are encoded independently of each other.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.items():   # .iteritems() was removed in pandas 2.0
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
# usage sketch: columns_with_strings_as_values, train and test are placeholders
label_encoder = MultiColumnLabelEncoder(columns=columns_with_strings_as_values)
train = label_encoder.fit_transform(train)
test = label_encoder.transform(test)
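A self-contained toy example of the wrapper (column names here are made up for illustration):

toy = pd.DataFrame({'color': ['red', 'blue', 'red'],
                    'size':  ['S', 'M', 'S'],
                    'price': [10, 20, 15]})
# encode only the string columns; the numeric 'price' column is left untouched
toy_encoded = MultiColumnLabelEncoder(columns=['color', 'size']).fit_transform(toy)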
1.2 For features (input must be 2-D, not a 1-D array): sklearn.preprocessing.OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

data_.iloc[:, 1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:, 1:-1])
data_
    Age  Sex  Embarked  Survived
0  22.0  1.0       2.0         0
1  38.0  0.0       0.0         2
2  26.0  0.0       2.0         2
3   NaN  0.0       2.0         2
4  35.0  1.0       2.0         0
5   NaN  1.0       1.0         1
6  58.0  1.0       1.0         0
7  20.0  0.0       0.0         2
8   2.0  0.0       2.0         1
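As with LabelEncoder, a fitted OrdinalEncoder exposes its mapping and can reverse it. A minimal sketch on the original (still-string) columns:

oe = OrdinalEncoder()
encoded = oe.fit_transform(data.iloc[:, 1:-1])   # the Sex and Embarked columns
oe.categories_                                   # one category array per column; position = code
oe.inverse_transform(encoded)                    # recover the original string values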
1.3 One-hot encoding (for nominal variables): sklearn.preprocessing.OneHotEncoder
1.3.1 Principle & process
'''
Nominal variables: no ordering, values cannot be computed with each other. Use OneHotEncoder.
    e.g. port of embarkation (S, C, Q), sex.
Ordinal variables: ordered, but values cannot be computed with each other. Use OrdinalEncoder.
    e.g. education level (primary school, high school, university).
Interval variables: ordered and arithmetic makes sense. e.g. weight (2 kg, 4 kg, 6 kg).

Simply mapping a nominal variable to (1, 2, 3) injects an artificial numeric order
that can mislead the model. OneHotEncoder turns nominal variables into dummy variables:

    'S'   [0,            'S'   [[1, 0, 0],
    'Q'    1,    >>>     'Q'    [0, 1, 0],
    'C'    2]            'C'    [0, 0, 1]]
'''
'''
For a binary categorical variable, once one dummy column is known the other is fully
determined, so one of them is redundant:

    ID  Gender            ID  Gender_F  Gender_M
    1   F                 1   1         0
    2   M        >>>      2   0         1
    3   M                 3   0         1
    4   F                 4   1         0

OneHotEncoder(drop='if_binary') keeps a single 0/1 column for binary variables and
only expands multi-class variables:

    ID  Gender  Income            ID  Gender  Income_High  Income_Medium  Income_Low
    1   F       High              1   0       1            0              0
    2   M       Medium    >>>     2   1       0            1              0
    3   M       High              3   1       1            0              0
    4   F       Low               4   0       0            0              1
'''
X = pd.DataFrame({'Gender': ['F', 'M', 'M', 'F'],
                  'Income': ['High', 'Medium', 'High', 'Low']})
X
  Gender  Income
0      F    High
1      M  Medium
2      M    High
3      F     Low
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(drop='if_binary')
enc.fit_transform(X).toarray()
'''
array([[0., 1., 0., 0.],
[1., 0., 0., 1.],
[1., 1., 0., 0.],
[0., 0., 1., 0.]])
'''
'''
Binary column:      F >>> 0, M >>> 1 (kept as a single column).
Multi-class column: first column = High, second = Low, third = Medium
(categories are sorted alphabetically).
'''
enc.categories_
'''
[array(['F', 'M'], dtype=object),
array(['High', 'Low', 'Medium'], dtype=object)]
'''
cate_cols = X.columns.tolist()
cate_cols
'''
['Gender', 'Income']
'''
cate_cols_new = []
for idx, colname in enumerate(cate_cols):
    if len(enc.categories_[idx]) == 2:
        # binary column: drop='if_binary' keeps a single column under the original name
        cate_cols_new.append(colname)
    else:
        # multi-class column: one new column per category
        for f in enc.categories_[idx]:
            feature_name = colname + '_' + f
            cate_cols_new.append(feature_name)
cate_cols_new
'''
['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''
pd.DataFrame(enc.fit_transform(X).toarray(), columns=cate_cols_new)
   Gender  Income_High  Income_Low  Income_Medium
0     0.0          1.0         0.0            0.0
1     1.0          0.0         0.0            1.0
2     1.0          1.0         0.0            0.0
3     0.0          0.0         1.0            0.0
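For quick exploratory work, pandas offers pd.get_dummies as a one-line alternative (note that it refits on whatever data it sees, so it does not guarantee consistent columns between train and test the way a fitted OneHotEncoder does). A minimal sketch on the same X:

pd.get_dummies(X, columns=['Gender', 'Income'])
# drop_first=True would drop the first level of every column (not only binary ones),
# so it is not an exact equivalent of drop='if_binary'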
1.3.2 Wrapping the column-naming logic into a function
def cate_colName(Transformer, category_cols, drop='if_binary'):
    """
    Build the output column names for one-hot-encoded categorical fields.
    :param Transformer: fitted OneHotEncoder
    :param category_cols: original column names
    :param drop: the encoder's drop parameter
    """
    cate_cols_new = []
    col_value = Transformer.categories_
    for idx, colname in enumerate(category_cols):
        if (len(col_value[idx]) == 2) & (drop == 'if_binary'):
            cate_cols_new.append(colname)
        else:
            for f in col_value[idx]:
                feature_name = colname + '_' + f
                cate_cols_new.append(feature_name)
    return cate_cols_new
cate_colName(enc, cate_cols)
'''
['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''
1.3.3 Encoding multi-column data
from sklearn.preprocessing import OneHotEncoder

X = data.iloc[:, 1:-1]                          # the Sex and Embarked columns
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
result
'''
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.],
[1., 0., 0., 0., 1.],
[0., 1., 0., 0., 1.],
[0., 1., 0., 1., 0.],
[0., 1., 0., 1., 0.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
'''
newdata = pd.concat([data, pd.DataFrame(result)], axis=1)
newdata.drop(['Sex', 'Embarked'], axis=1, inplace=True)
newdata
    Age Survived    0    1    2    3    4
0  22.0       No  0.0  1.0  0.0  0.0  1.0
1  38.0      Yes  1.0  0.0  1.0  0.0  0.0
2  26.0      Yes  1.0  0.0  0.0  0.0  1.0
3   NaN      Yes  1.0  0.0  0.0  0.0  1.0
4  35.0       No  0.0  1.0  0.0  0.0  1.0
5   NaN  Unknown  0.0  1.0  0.0  1.0  0.0
6  58.0       No  0.0  1.0  0.0  1.0  0.0
7  20.0      Yes  1.0  0.0  1.0  0.0  0.0
8   2.0  Unknown  1.0  0.0  0.0  0.0  1.0
print(enc.get_feature_names())   # deprecated since sklearn 1.0; newer versions use get_feature_names_out()
'''
['x0_female' 'x0_male' 'x1_C' 'x1_Q' 'x1_S']
'''
newdata.columns = ['Age', 'Survived', 'female', 'male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
newdata
    Age Survived  female  male  Embarked_C  Embarked_Q  Embarked_S
0  22.0       No     0.0   1.0         0.0         0.0         1.0
1  38.0      Yes     1.0   0.0         1.0         0.0         0.0
2  26.0      Yes     1.0   0.0         0.0         0.0         1.0
3   NaN      Yes     1.0   0.0         0.0         0.0         1.0
4  35.0       No     0.0   1.0         0.0         0.0         1.0
5   NaN  Unknown     0.0   1.0         0.0         1.0         0.0
6  58.0       No     0.0   1.0         0.0         1.0         0.0
7  20.0      Yes     1.0   0.0         1.0         0.0         0.0
8   2.0  Unknown     1.0   0.0         0.0         0.0         1.0
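In scikit-learn >= 1.0 the encoder can generate these names itself via get_feature_names_out (and in >= 1.2 the old get_feature_names is removed), which makes the manual renaming above unnecessary. A minimal sketch reusing enc and result from above:

feature_names = enc.get_feature_names_out(['Sex', 'Embarked'])
# expected: ['Sex_female' 'Sex_male' 'Embarked_C' 'Embarked_Q' 'Embarked_S']
newdata = pd.concat([data.drop(['Sex', 'Embarked'], axis=1),
                     pd.DataFrame(result, columns=feature_names, index=data.index)],
                    axis=1)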
1.3.4 Encoding 1-D data: .reshape(-1, 1)
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder().fit_transform(Y_train.reshape(-1, 1))   # the 1-D label array must be reshaped to 2-D
onehot = onehot.toarray()
print('onehot', onehot)
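If a dense array is wanted directly, the encoder can be asked for dense output instead of calling .toarray() afterwards; the parameter is sparse_output in scikit-learn >= 1.2 (older versions use sparse=False). A minimal sketch, assuming Y_train is a 1-D label array:

onehot = OneHotEncoder(sparse_output=False).fit_transform(Y_train.reshape(-1, 1))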
1.3.5 Encoding with Keras: keras.utils.np_utils.to_categorical(data)
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

Y                                               # the original (string) label array
encoder = LabelEncoder()
Y_labelencoded = encoder.fit_transform(Y)       # strings -> integer codes
Y_onehot = np_utils.to_categorical(Y_labelencoded)   # integer codes -> one-hot matrix
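In recent TensorFlow/Keras releases np_utils has been removed and the same helper lives at tf.keras.utils.to_categorical. A self-contained sketch with a toy label array:

import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical   # replaces keras.utils.np_utils

y = np.array(['cat', 'dog', 'cat', 'bird'])
y_int = LabelEncoder().fit_transform(y)   # [1, 2, 1, 0], categories sorted alphabetically
y_onehot = to_categorical(y_int)          # shape (4, 3), one column per class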
1.4 Converting a string feature to int: (df['f1'] == 'Yes').astype(int)
'''
== 'Yes' returns a boolean Series of True/False values;
.astype(int) converts them to 1/0.
'''
original_data['Attrition'] = (original_data['Attrition'] == 'Yes').astype(int)
2. Continuous Variable Binning (Encoding Continuous Variables)
2.1 Principle
'''
Binning turns a continuous field into a discrete one:
  - reduces the influence of outliers and removes the effect of the feature's scale;
  - for linear models it introduces non-linearity and can improve performance;
  - for tree models it throws away information in the continuous values and can hurt performance.

Example with bins [0, 30) -> 0, [30, 60) -> 1, [60, inf) -> 2:

    ID  Income          ID  Income_Level
    1   0               1   0
    2   10              2   0
    3   180     >>>     3   2
    4   30              4   1
    5   55              5   1
'''
'''
Equal-width binning      strategy='uniform'   still somewhat affected by outliers
Equal-frequency binning  strategy='quantile'  completely ignores outliers
K-means binning          strategy='kmeans'    respects the original value distribution; try it first
'''
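Equal-width and equal-frequency binning are also available directly in pandas as pd.cut and pd.qcut, which is convenient when working on a DataFrame column. A minimal sketch with a toy income column:

s = pd.Series([0, 10, 180, 30, 55, 35, 25, 75, 80, 10])
pd.cut(s, bins=3, labels=False)    # equal-width bins, returns integer codes
pd.qcut(s, q=3, labels=False)      # equal-frequency (quantile) bins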
2.2 Equal-width binning: KBinsDiscretizer(strategy='uniform')
import numpy as np

income = np.array([0, 10, 180, 30, 55, 35, 25, 75, 80, 10]).reshape(-1, 1)
income
'''
array([[ 0],
[ 10],
[180],
[ 30],
[ 55],
[ 35],
[ 25],
[ 75],
[ 80],
[ 10]])
'''
from sklearn.preprocessing import KBinsDiscretizer

'''
KBinsDiscretizer parameters:
    n_bins    number of bins
    strategy  binning strategy
        'uniform'   equal-width binning
        'quantile'  equal-frequency binning
        'kmeans'    k-means binning
    encode    how the binned field is encoded
        'ordinal'       integer codes (a single column)
        'onehot'        one-hot encoding (sparse matrix)
        'onehot-dense'  one-hot encoding (dense array)
'''
dis = KBinsDiscretizer(n_bins=3, strategy='uniform', encode='ordinal')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.]])
'''
dis.bin_edges_
'''
array([array([ 0., 60., 120., 180.])], dtype=object)
'''
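With strategy='uniform' the edges simply split the observed range [0, 180] into three equal-width intervals of 60: [0, 60), [60, 120) and [120, 180].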
2.3 Equal-frequency binning: KBinsDiscretizer(strategy='quantile')
'''
Splits the values into bins containing (roughly) equal numbers of samples.
If the sample count is not divisible by the number of bins, the remainder ends up
in one of the bins (10 samples / 3 bins -> 3/3/4 here).
'''
np.sort(income.flatten(), axis=0)
'''
array([ 0, 10, 10, 25, 30, 35, 55, 75, 80, 180])
'''
dis = KBinsDiscretizer(n_bins=3, strategy='quantile', encode='ordinal')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[1.],
[1.],
[1.],
[0.],
[2.],
[2.],
[0.]])
'''
dis.bin_edges_
'''
array([array([ 0., 25., 55., 180.])], dtype=object)
'''
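Here the edges 25 and 55 are (approximately) the 1/3 and 2/3 quantiles of the sorted values, so the bins hold roughly equal numbers of samples: 3, 3 and 4 in this example.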
2.4 K-means binning: KBinsDiscretizer(strategy='kmeans')
from sklearn import cluster

kmeans = cluster.KMeans(n_clusters=3)
kmeans.fit(income)
kmeans.labels_
'''
array([0, 0, 1, 0, 2, 0, 0, 2, 2, 0], dtype=int32)
'''
dis = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.]])
'''
dis.bin_edges_
'''
array([array([ 0. , 44.16666667, 125. , 180. ])],
dtype=object)
'''
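With strategy='kmeans' the values are clustered in one dimension and the bin edges are placed midway between adjacent cluster centers, so the bins follow the original value distribution.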
2.5 Binarizing continuous variables
2.5.1 sklearn.preprocessing.Binarizer
data_2 = data.copy()
data_2.loc[:, 'Age'] = data_2.loc[:, 'Age'].fillna(data_2.loc[:, 'Age'].mean())   # fill missing ages with the mean
X = data_2.iloc[:, 0].values.reshape(-1, 1)
X
'''
array([[22. ],
[38. ],
[26. ],
[28.71428571],
[35. ],
[28.71428571],
[58. ],
[20. ],
[ 2. ]])
'''
from sklearn.preprocessing import Binarizer

transformer = Binarizer(threshold=25).fit_transform(X)   # 1 if the value is above 25, else 0
transformer
'''
array([[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.]])
'''
2.5.2 Simple DataFrame operation: (df['f1'] > threshold).astype(int)
student['age'] = (student['age'] > 12).astype(int)
2.6 Regression Class Cutoff (finding the best thresholds for mapping continuous predictions to integer classes)
[notebook] xgboost, cudf, Regression Class Cutoff (best thresholds for mapping continuous values to class integers)
import numpy as np
import scipy as sp
import scipy.optimize                          # registers sp.optimize
from sklearn.metrics import cohen_kappa_score
from functools import partial

'''
Regression predictions are continuous; to turn them into a fixed set of classes
we need cutoff values, and this class searches for the best cutoffs.
'''
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        # Map each continuous prediction to a class (3..8) using the current cutoffs
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif coef[0] <= pred < coef[1]:
                X_p[i] = 4
            elif coef[1] <= pred < coef[2]:
                X_p[i] = 5
            elif coef[2] <= pred < coef[3]:
                X_p[i] = 6
            elif coef[3] <= pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        # Quadratic weighted kappa; return its negative so that minimizing
        # the loss maximizes the kappa score
        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        # Fix X and y in _kappa_loss, start from initial_coef,
        # and search for the optimal cutoffs with Nelder-Mead
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        # Apply a given set of cutoffs to continuous predictions
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif coef[0] <= pred < coef[1]:
                X_p[i] = 4
            elif coef[1] <= pred < coef[2]:
                X_p[i] = 5
            elif coef[2] <= pred < coef[3]:
                X_p[i] = 6
            elif coef[3] <= pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p.astype('int')

    def coefficients(self):
        # the OptimizeResult returned by minimize is dict-like; 'x' holds the cutoffs
        return self.coef_['x']
The code below is one fold of a 10-fold cross-validation loop:
optR = OptimizedRounder()
optR.fit(xgb_valid_preds, val_target)                            # fit cutoffs on the validation predictions
temp_oof = optR.predict(xgb_valid_preds, optR.coefficients())    # out-of-fold predictions as integer classes
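Once fitted on the validation predictions, the same thresholds would typically also be applied to the test-set predictions; the variable name below is illustrative:

test_preds_rounded = optR.predict(xgb_test_preds, optR.coefficients())   # xgb_test_preds: hypothetical test predictions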