Common Feature Engineering with Python
In the previous post we mentioned that data analysts usually do some data cleaning already when extracting business data, along with feature handling driven by business or data logic. Since feature engineering is a crucial step in data modeling, this post puts together a brief summary. I hope it helps!
First, here is an overview diagram of feature engineering (below):
Single-Feature Operations
Data Transformation
One-hot encoding with pd.get_dummies; drop_first=True keeps k-1 dummy columns for k categories, avoiding perfect collinearity:

import pandas as pd

df = pd.DataFrame({'客户编号': [1, 2, 3], '性别': ['男', '女', '男']})
print(df)

# One-hot encode the 性别 (gender) column
df1 = pd.get_dummies(df, columns=['性别'])
print('-' * 30)
print(df1)

# drop_first=True drops one dummy level per feature
df2 = pd.get_dummies(df, columns=['性别'], drop_first=True)
print('-' * 30)
print(df2)
客户编号 性别
0 1 男
1 2 女
2 3 男
------------------------------
客户编号 性别_女 性别_男
0 1 0 1
1 2 1 0
2 3 0 1
------------------------------
客户编号 性别_男
0 1 1
1 2 0
2 3 1
Mapping categories to integers with replace or map:

df = pd.DataFrame({'编号': [1, 2, 3, 4, 5], '城市': ['北京', '上海', '广州', '深圳', '北京']})
print(df)

# Option 1: replace with a dict
df1 = df.copy()
df1['城市'] = df1['城市'].replace({'北京': 0, '上海': 1, '广州': 2, '深圳': 3})
print('-' * 30)
print(df1)

# Option 2: map with a dict (unmapped values become NaN)
df2 = df.copy()
city_dic = {'北京': 0, '上海': 1, '广州': 2, '深圳': 3}
df2['城市'] = df2['城市'].map(city_dic)
print('-' * 30)
print(df2)
编号 城市
0 1 北京
1 2 上海
2 3 广州
3 4 深圳
4 5 北京
------------------------------
编号 城市
0 1 0
1 2 1
2 3 2
3 4 3
4 5 0
------------------------------
编号 城市
0 1 0
1 2 1
2 3 2
3 4 3
4 5 0
LabelEncoder assigns integer codes automatically (categories are sorted by value, so the mapping can differ from a hand-written dict):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df3 = df.copy()
df3['城市'] = le.fit_transform(df3['城市'])
print(df3)
编号 城市
0 1 1
1 2 0
2 3 2
3 4 3
4 5 1
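Note that LabelEncoder is designed for target labels; for feature matrices, scikit-learn's OrdinalEncoder does the same integer coding on 2-D input. A minimal sketch:

from sklearn.preprocessing import OrdinalEncoder

# OrdinalEncoder also sorts categories by value, returning floats
df4 = df.copy()
df4[['城市']] = OrdinalEncoder().fit_transform(df4[['城市']])
print(df4)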
OneHotEncoder returns a sparse matrix by default; toarray() densifies it:

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit_transform(df[['城市']]).toarray()
array([[0., 1., 0., 0.],
[1., 0., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 1., 0., 0.]])
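To keep track of which column corresponds to which category, recent scikit-learn versions (1.0+) expose get_feature_names_out(); a minimal sketch:

# Label the one-hot columns (requires scikit-learn >= 1.0)
ohe_df = pd.DataFrame(ohe.fit_transform(df[['城市']]).toarray(),
                      columns=ohe.get_feature_names_out(['城市']))
print(ohe_df)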
Binarization maps a numeric feature to 0/1 around a threshold, first by hand with map:

import pandas as pd

df = pd.DataFrame({'age': range(1, 5)})
print(df)

# 0 if age <= 2, else 1
df1 = df.copy()
df1['age_b'] = df1['age'].map(lambda x: 0 if x <= 2 else 1)
print('-' * 30)
print(df1)
age
0 1
1 2
2 3
3 4
------------------------------
age age_b
0 1 0
1 2 0
2 3 1
3 4 1
The same with scikit-learn's Binarizer (values above the threshold become 1):

from sklearn.preprocessing import Binarizer

df2 = df.copy()
df2['age_b'] = Binarizer(threshold=2).fit_transform(df2[['age']])
print(df2)
age age_b
0 1 0
1 2 0
2 3 1
3 4 1
Binning with pd.cut: pass explicit bin edges, optionally with integer labels:

import pandas as pd

df = pd.DataFrame([[22, 1], [25, 1], [20, 0], [35, 0], [32, 1], [38, 0], [50, 0], [46, 1]],
                  columns=['age', 'churn'])
print(pd.cut(df['age'], bins=[-1, 20, 50, 99]))
print('-' * 30)
print(pd.cut(df['age'], bins=[-1, 20, 50, 99], labels=[0, 1, 2]))
0 (20, 50]
1 (20, 50]
2 (-1, 20]
3 (20, 50]
4 (20, 50]
5 (20, 50]
6 (20, 50]
7 (20, 50]
Name: age, dtype: category
Categories (3, interval[int64]): [(-1, 20] < (20, 50] < (50, 99]]
------------------------------
0 1
1 1
2 0
3 1
4 1
5 1
6 1
7 1
Name: age, dtype: category
Categories (3, int64): [0 < 1 < 2]
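For equal-frequency rather than equal-width bins, pd.qcut is the quantile-based companion to pd.cut; a minimal sketch:

# Split age into 4 equal-frequency bins, labelled 0-3
print(pd.qcut(df['age'], q=4, labels=[0, 1, 2, 3]))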
The toad library's Combiner supports several binning strategies: 'quantile' (equal frequency), 'step' (equal width), 'chi' (chi-square merging), 'dt' (decision tree), and 'kmeans'. The supervised methods need the target column:

import toad

c = toad.transform.Combiner()
print(c.fit(df['age'], method='quantile').export())
print('-' * 30)
print(c.fit(df['age'], method='step').export())
print('-' * 30)
# 'chi' and 'dt' are supervised: pass the frame and name the target
print(c.fit(df, y='churn', method='chi').export())
print('-' * 30)
print(c.fit(df, y='churn', method='dt').export())
print('-' * 30)
print(c.fit(df['age'], method='kmeans', n_bins=3).export())
{'age': [21.4, 23.2, 25.700000000000003, 30.6, 33.5, 35.6, 37.7, 42.800000000000004, 47.2]}
------------------------------
{'age': [23.0, 26.0, 29.0, 32.0, 35.0, 38.0, 41.0, 44.0, 47.0]}
------------------------------
{'age': [22, 25, 32, 35, 38, 46, 50]}
------------------------------
{'age': [21.0, 33.5, 42.0, 48.0]}
------------------------------
{'age': [28.666666666666668, 41.5]}
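Once fitted, the Combiner can be applied to data. Assuming toad's transform(X, labels=...) API, a minimal sketch:

# Apply the last fitted binning; labels=True returns readable interval labels
print(c.transform(df[['age']], labels=True))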
Nonlinear transformations can reshape a skewed distribution: square root, log, Box-Cox, or any custom function via FunctionTransformer. Note that log and Box-Cox require strictly positive values:

import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import FunctionTransformer

df = pd.DataFrame({'sales': [3, 7, 8, 2, 6, 3, 6]})

df.insert(len(df.columns), 'sqrt', np.sqrt(df['sales']))
df.insert(len(df.columns), 'log', np.log(df['sales']))
df.insert(len(df.columns), 'boxcox', boxcox(df['sales'])[0])

# Wrap an arbitrary function as a transformer
def my_func(x):
    return x / 2

transformer = FunctionTransformer(my_func)
df.insert(len(df.columns), 'myfunc', transformer.transform(df['sales']))
print(df)
sales sqrt log boxcox myfunc
0 3 1.732051 1.098612 1.639046 1.5
1 7 2.645751 1.945910 4.078599 3.5
2 8 2.828427 2.079442 4.609387 4.0
3 2 1.414214 0.693147 0.887320 1.0
4 6 2.449490 1.791759 3.523320 3.0
5 3 1.732051 1.098612 1.639046 1.5
6 6 2.449490 1.791759 3.523320 3.0
Standardization
Min-max scaling rescales each column to [0, 1]:

import pandas as pd
import numpy as np

df = pd.DataFrame({'sales': [3, 7, 8], 'rand': [-1, 3, 5]})
df[['sales', 'rand']].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
   sales      rand
0    0.0  0.000000
1    0.8  0.666667
2    1.0  1.000000
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
min_max_scaler.fit_transform(df)
Z-score standardization centers each column at 0 with unit variance, by hand, with scale, or with StandardScaler:

df.apply(lambda x: (x - x.mean()) / x.std(ddof=0))
     sales      rand
0 -1.38873 -1.336306
1  0.46291  0.267261
2  0.92582  1.069045
from sklearn.preprocessing import scale

scale(df)
array([[-1.38873015, -1.33630621],
[ 0.46291005, 0.26726124],
[ 0.9258201 , 1.06904497]])
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
std_scaler.fit_transform(df)
array([[-1.38873015, -1.33630621],
[ 0.46291005, 0.26726124],
[ 0.9258201 , 1.06904497]])
Normalizer is different: it rescales each row (sample) to unit norm, not each column:

from sklearn.preprocessing import Normalizer

norm = Normalizer()
norm.fit_transform(df)
array([[ 0.9486833 , -0.31622777],
[ 0.91914503, 0.3939193 ],
[ 0.8479983 , 0.52999894]])
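A quick sanity check (a minimal sketch) confirms every row now has L2 norm 1:

import numpy as np

# Each row vector should have unit length after Normalizer
print(np.linalg.norm(norm.fit_transform(df), axis=1))  # -> [1. 1. 1.]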
Special Data Handling
Missing values: dropna can drop rows with any NaN, only all-NaN rows, or rows with too few non-NaN values:

df = pd.DataFrame([[1, np.nan, 3], [np.nan, np.nan, np.nan], [np.nan, np.nan, 0]],
                  columns=['c1', 'c2', 'c3'])
print(df)
print('-' * 30)
print(df.dropna())           # drop rows containing any NaN
print('-' * 30)
print(df.dropna(how='all'))  # drop rows that are entirely NaN
print('-' * 30)
print(df.dropna(thresh=2))   # keep rows with at least 2 non-NaN values
c1 c2 c3
0 1.0 NaN 3.0
1 NaN NaN NaN
2 NaN NaN 0.0
------------------------------
Empty DataFrame
Columns: [c1, c2, c3]
Index: []
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
2 NaN NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
fillna can fill with a constant, a column statistic, or by propagating the previous valid value:

import random

print(df.fillna(int(random.random() * 10)))  # fill with a random integer 0-9
print('-' * 30)
print(df.fillna(0))                          # fill with a constant
print('-' * 30)
print(df.fillna(df.mean()))                  # fill with each column's mean
print('-' * 30)
print(df.fillna(df.median()))                # fill with each column's median
print('-' * 30)
print(df.fillna(method='ffill'))             # forward fill (newer pandas prefers df.ffill())
c1 c2 c3
0 1.0 8.0 3.0
1 8.0 8.0 8.0
2 8.0 8.0 0.0
------------------------------
c1 c2 c3
0 1.0 0.0 3.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 1.5
2 1.0 NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 1.5
2 1.0 NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 3.0
2 1.0 NaN 0.0
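scikit-learn's SimpleImputer does the same inside a modeling pipeline; a minimal sketch using the mean strategy:

from sklearn.impute import SimpleImputer

# Note: a column that is entirely NaN (like c2 here) is discarded by SimpleImputer
imp = SimpleImputer(strategy='mean')
print(imp.fit_transform(df))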
Duplicate rows are removed with drop_duplicates:

df = pd.DataFrame([[1, 2, 3], [1, 2, 3], [4, 5, 6]], columns=['c1', 'c2', 'c3'])
print(df)
print('-' * 30)
print(df.drop_duplicates())
c1 c2 c3
0 1 2 3
1 1 2 3
2 4 5 6
------------------------------
c1 c2 c3
0 1 2 3
2 4 5 6
Outliers: inspect visually with a boxplot, then flag numerically, here with values whose z-score exceeds 2:

df = pd.DataFrame({'c1': [3, 10, 5, 7, 1, 9, 93],
                   'c2': [15, 16, 14, 78, 19, 11, 8],
                   'c3': [20, 15, 18, 21, 101, 27, 29]},
                  columns=['c1', 'c2', 'c3'])
print(df)

# Visual check
import matplotlib.pyplot as plt
print('-' * 30)
df.boxplot()
plt.show()

# Numerical check: keep only values more than 2 standard deviations above the mean
z = lambda x: (x - x.mean()) / x.std(ddof=0)
print('-' * 30)
print(df[df.apply(z) > 2].dropna(how='all'))
c1 c2 c3
0 3 15 20
1 10 16 15
2 5 14 18
3 7 78 21
4 1 19 101
5 9 11 27
6 93 8 29
------------------------------
[boxplot of c1, c2, c3 omitted]
------------------------------
c1 c2 c3
3 NaN 78.0 NaN
4 NaN NaN 101.0
6 93.0 NaN NaN
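Another common rule is the IQR fence: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is treated as an outlier. A minimal sketch:

# IQR-based outlier mask, computed per column
q1, q3 = df.quantile(0.25), df.quantile(0.75)
iqr = q3 - q1
mask = (df < q1 - 1.5 * iqr) | (df > q3 + 1.5 * iqr)
print(df[mask].dropna(how='all'))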
Multi-Feature Operations
Multicollinearity
We'll use the iris dataset for the multi-feature examples:

from sklearn.datasets import load_iris

iris = load_iris()
X = pd.DataFrame(iris.data, columns=['sl', 'sw', 'pl', 'pw'])
y = pd.DataFrame(iris.target, columns=['y'])
df = pd.concat([X, y], axis=1)
print(df.head())
sl sw pl pw y
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
Pairwise correlations reveal strongly related features:

df.corr()
          sl        sw        pl        pw         y
sl  1.000000 -0.117570  0.871754  0.817941  0.782561
sw -0.117570  1.000000 -0.428440 -0.366126 -0.426658
pl  0.871754 -0.428440  1.000000  0.962865  0.949035
pw  0.817941 -0.366126  0.962865  1.000000  0.956547
y   0.782561 -0.426658  0.949035  0.956547  1.000000
The variance inflation factor (VIF) quantifies multicollinearity; as a common rule of thumb, values above 10 are considered problematic. Dropping correlated columns shrinks the remaining VIFs:

from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = [variance_inflation_factor(X.values, X.columns.get_loc(i)) for i in X.columns]
print(vif)

# Drop sl and recompute
X1 = df.drop(columns=['sl', 'y'])
vif = [variance_inflation_factor(X1.values, X1.columns.get_loc(i)) for i in X1.columns]
print('-' * 30)
print(vif)

# Drop sl and pl and recompute
X2 = df.drop(columns=['sl', 'pl', 'y'])
vif = [variance_inflation_factor(X2.values, X2.columns.get_loc(i)) for i in X2.columns]
print('-' * 30)
print(vif)
[262.9693482414677, 96.35329172369063, 172.96096155387588, 55.50205979323753]
------------------------------
[5.856964572603174, 62.071308334041554, 43.2925737234071]
------------------------------
[2.891774016941542, 2.8917740169415427]
Dimensionality Reduction
PCA projects the features onto the directions of maximum variance (unsupervised):

from sklearn.decomposition import PCA

PCA(n_components=2).fit_transform(iris.data)[0:3]
array([[-2.68412563, 0.31939725],
[-2.71414169, -0.17700123],
[-2.88899057, -0.14494943]])
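To see how much variance the retained components capture, a minimal sketch:

pca = PCA(n_components=2).fit(iris.data)
# Fraction of total variance explained by each component
print(pca.explained_variance_ratio_)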
LDA is supervised: it looks for directions that best separate the classes:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

LDA(n_components=2).fit_transform(iris.data, iris.target)[0:3]
array([[ 8.06179978, 0.30042062],
[ 7.12868772, -0.78666043],
[ 7.48982797, -0.26538449]])
Constructing New Features
PolynomialFeatures generates interaction and power terms; with the default degree=2, the 4 iris features expand to 15 columns (1 bias + 4 linear + 10 quadratic):

from sklearn.preprocessing import PolynomialFeatures

pf = PolynomialFeatures().fit_transform(iris.data)
print(pf[0:3])
print('-' * 30)
print(pf.shape)
[[ 1. 5.1 3.5 1.4 0.2 26.01 17.85 7.14 1.02 12.25 4.9 0.7
1.96 0.28 0.04]
[ 1. 4.9 3. 1.4 0.2 24.01 14.7 6.86 0.98 9. 4.2 0.6
1.96 0.28 0.04]
[ 1. 4.7 3.2 1.3 0.2 22.09 15.04 6.11 0.94 10.24 4.16 0.64
1.69 0.26 0.04]]
------------------------------
(150, 15)
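If you only want cross terms and no squares, interaction_only=True restricts the expansion; a minimal sketch:

# 1 bias + 4 linear + 6 pairwise interaction terms = 11 columns
pf2 = PolynomialFeatures(degree=2, interaction_only=True).fit_transform(iris.data)
print(pf2.shape)  # (150, 11)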
Feature Selection
toad.selection.select filters features by missing rate, information value (IV), and pairwise correlation in one call; return_drop=True also reports which features were dropped and why:

import toad

toad.selection.select(df, df.y,
                      empty=0.7,   # drop features that are >70% missing
                      iv=0.1,      # drop features with IV < 0.1
                      corr=0.95,   # of pairs with corr > 0.95, drop the lower-IV one
                      return_drop=True)
( sl sw pl y
0 5.1 3.5 1.4 0
1 4.9 3.0 1.4 0
2 4.7 3.2 1.3 0
3 4.6 3.1 1.5 0
4 5.0 3.6 1.4 0
.. ... ... ... ..
145 6.7 3.0 5.2 2
146 6.3 2.5 5.0 2
147 6.5 3.0 5.2 2
148 6.2 3.4 5.4 2
149 5.9 3.0 5.1 2
[150 rows x 4 columns],
{'empty': array([], dtype=float64),
'iv': array([], dtype=object),
'corr': array(['pw'], dtype=object)})
The population stability index (PSI) measures distribution drift between two samples, e.g. train vs. test; as a common rule of thumb, PSI below 0.1 means stable, and above 0.25 signals significant shift:

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3, random_state=0)
np.seterr(divide='ignore', invalid='ignore')
toad.metrics.PSI(train, test).sort_values(0)
y 0.081994
pl 0.316619
pw 0.418114
sw 0.425005
sl 0.762664
dtype: float64
Stepwise selection adds and removes features to optimize a criterion such as AIC:

toad.selection.stepwise(df,
                        df.y,
                        direction='both',
                        criterion='aic',
                        estimator='ols',
                        intercept=False).head()
    sl  y
0  5.1  0
1  4.9  0
2  4.7  0
3  4.6  0
4  5.0  0
Filter methods: VarianceThreshold keeps only features whose variance exceeds the threshold:

from sklearn.feature_selection import VarianceThreshold

VarianceThreshold(threshold=3).fit_transform(iris.data)[0:3]
array([[1.4],
[1.4],
[1.3]])
SelectKBest with a custom scoring function, here the Pearson correlation of each feature with the target:

from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

# Score each column by its Pearson correlation with y
r = lambda X, Y: np.array(list(map(lambda x: pearsonr(x, Y)[0], X.T))).T
SelectKBest(r, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
SelectKBest with the chi-square test:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
SelectKBest with mutual information:

from sklearn.feature_selection import SelectKBest
from sklearn import metrics

mic = metrics.mutual_info_score
# Score each column by its mutual information with y
g = lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T
SelectKBest(g, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
Wrapper methods: recursive feature elimination (RFE) repeatedly fits a model and prunes the weakest features:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

RFE(estimator=LogisticRegression(solver='liblinear'),
    n_features_to_select=2).fit_transform(iris.data, iris.target)[0:3]
array([[3.5, 0.2],
[3. , 0.2],
[3.2, 0.2]])
Embedded methods: SelectFromModel with an L1-penalized logistic regression keeps features with nonzero coefficients:

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

SelectFromModel(LogisticRegression(penalty='l1', C=0.1,
                                   solver='liblinear')).fit_transform(iris.data, iris.target)[0:3]
array([[5.1, 3.5, 1.4],
[4.9, 3. , 1.4],
[4.7, 3.2, 1.3]])
SelectFromModel also works with tree-based feature importances, e.g. gradient boosting:

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
Class Imbalance
Oversampling
SMOTE oversamples minority classes by interpolating between neighboring minority samples (iris is already balanced, so this is just API illustration):

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_smotesampled, y_smotesampled = smote.fit_resample(iris.data, iris.target)
Undersampling
Random undersampling discards majority-class samples until the classes are balanced:

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
X_undersampled, y_undersampled = rus.fit_resample(iris.data, iris.target)
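Either way, it's worth verifying the class counts after resampling; a minimal sketch:

from collections import Counter

# iris is balanced (50 per class), so counts are unchanged here;
# on an imbalanced dataset you'd see minority classes grow (SMOTE)
# or majority classes shrink (undersampling)
print(Counter(y_smotesampled))
print(Counter(y_undersampled))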