Common Feature Engineering with Python
In the previous post we mentioned that data analysts usually do some data cleaning already when extracting business data, along with feature handling driven by business or data logic. Since feature engineering is a crucial step in data modeling, this post puts together a brief summary. I hope it helps!
First, here is an overview diagram of feature engineering (below):
Single-Feature Operations
Data Transformation
One-hot encoding with pd.get_dummies; drop_first=True keeps k-1 dummy columns for k categories, avoiding perfect collinearity:

import pandas as pd

df = pd.DataFrame({'客户编号': [1, 2, 3], '性别': ['男', '女', '男']})
print(df)

# One-hot encode the 性别 (gender) column
df1 = pd.get_dummies(df, columns=['性别'])
print('-' * 30)
print(df1)

# drop_first=True drops one dummy level per feature
df2 = pd.get_dummies(df, columns=['性别'], drop_first=True)
print('-' * 30)
print(df2)
客户编号 性别
0 1 男
1 2 女
2 3 男
------------------------------
客户编号 性别_女 性别_男
0 1 0 1
1 2 1 0
2 3 0 1
------------------------------
客户编号 性别_男
0 1 1
1 2 0
2 3 1
Mapping categories to integers with replace or map:

df = pd.DataFrame({'编号': [1, 2, 3, 4, 5], '城市': ['北京', '上海', '广州', '深圳', '北京']})
print(df)

# Option 1: replace with a dict
df1 = df.copy()
df1['城市'] = df1['城市'].replace({'北京': 0, '上海': 1, '广州': 2, '深圳': 3})
print('-' * 30)
print(df1)

# Option 2: map with a dict (unmapped values become NaN)
df2 = df.copy()
city_dic = {'北京': 0, '上海': 1, '广州': 2, '深圳': 3}
df2['城市'] = df2['城市'].map(city_dic)
print('-' * 30)
print(df2)
编号 城市
0 1 北京
1 2 上海
2 3 广州
3 4 深圳
4 5 北京
------------------------------
编号 城市
0 1 0
1 2 1
2 3 2
3 4 3
4 5 0
------------------------------
编号 城市
0 1 0
1 2 1
2 3 2
3 4 3
4 5 0
LabelEncoder assigns integer codes automatically (categories are sorted by value, so the mapping can differ from a hand-written dict):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df3 = df.copy()
df3['城市'] = le.fit_transform(df3['城市'])
print(df3)
编号 城市
0 1 1
1 2 0
2 3 2
3 4 3
4 5 1
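Note that LabelEncoder is designed for target labels; for feature matrices, scikit-learn's OrdinalEncoder does the same integer coding on 2-D input. A minimal sketch:

from sklearn.preprocessing import OrdinalEncoder

# OrdinalEncoder also sorts categories by value, returning floats
df4 = df.copy()
df4[['城市']] = OrdinalEncoder().fit_transform(df4[['城市']])
print(df4)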
OneHotEncoder returns a sparse matrix by default; toarray() densifies it:

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit_transform(df[['城市']]).toarray()
array([[0., 1., 0., 0.],
[1., 0., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 1., 0., 0.]])
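To keep track of which column corresponds to which category, recent scikit-learn versions (1.0+) expose get_feature_names_out(); a minimal sketch:

# Label the one-hot columns (requires scikit-learn >= 1.0)
ohe_df = pd.DataFrame(ohe.fit_transform(df[['城市']]).toarray(),
                      columns=ohe.get_feature_names_out(['城市']))
print(ohe_df)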
Binarization maps a numeric feature to 0/1 around a threshold, first by hand with map:

import pandas as pd

df = pd.DataFrame({'age': range(1, 5)})
print(df)

# 0 if age <= 2, else 1
df1 = df.copy()
df1['age_b'] = df1['age'].map(lambda x: 0 if x <= 2 else 1)
print('-' * 30)
print(df1)
age
0 1
1 2
2 3
3 4
------------------------------
age age_b
0 1 0
1 2 0
2 3 1
3 4 1
The same with scikit-learn's Binarizer (values above the threshold become 1):

from sklearn.preprocessing import Binarizer

df2 = df.copy()
df2['age_b'] = Binarizer(threshold=2).fit_transform(df2[['age']])
print(df2)
age age_b
0 1 0
1 2 0
2 3 1
3 4 1
Binning with pd.cut: pass explicit bin edges, optionally with integer labels:

import pandas as pd

df = pd.DataFrame([[22, 1], [25, 1], [20, 0], [35, 0], [32, 1], [38, 0], [50, 0], [46, 1]],
                  columns=['age', 'churn'])
print(pd.cut(df['age'], bins=[-1, 20, 50, 99]))
print('-' * 30)
print(pd.cut(df['age'], bins=[-1, 20, 50, 99], labels=[0, 1, 2]))
0 (20, 50]
1 (20, 50]
2 (-1, 20]
3 (20, 50]
4 (20, 50]
5 (20, 50]
6 (20, 50]
7 (20, 50]
Name: age, dtype: category
Categories (3, interval[int64]): [(-1, 20] < (20, 50] < (50, 99]]
------------------------------
0 1
1 1
2 0
3 1
4 1
5 1
6 1
7 1
Name: age, dtype: category
Categories (3, int64): [0 < 1 < 2]
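For equal-frequency rather than equal-width bins, pd.qcut is the quantile-based companion to pd.cut; a minimal sketch:

# Split age into 4 equal-frequency bins, labelled 0-3
print(pd.qcut(df['age'], q=4, labels=[0, 1, 2, 3]))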
The toad library's Combiner supports several binning strategies: 'quantile' (equal frequency), 'step' (equal width), 'chi' (chi-square merging), 'dt' (decision tree), and 'kmeans'. The supervised methods need the target column:

import toad

c = toad.transform.Combiner()
print(c.fit(df['age'], method='quantile').export())
print('-' * 30)
print(c.fit(df['age'], method='step').export())
print('-' * 30)
# 'chi' and 'dt' are supervised: pass the frame and name the target
print(c.fit(df, y='churn', method='chi').export())
print('-' * 30)
print(c.fit(df, y='churn', method='dt').export())
print('-' * 30)
print(c.fit(df['age'], method='kmeans', n_bins=3).export())
{'age': [21.4, 23.2, 25.700000000000003, 30.6, 33.5, 35.6, 37.7, 42.800000000000004, 47.2]}
------------------------------
{'age': [23.0, 26.0, 29.0, 32.0, 35.0, 38.0, 41.0, 44.0, 47.0]}
------------------------------
{'age': [22, 25, 32, 35, 38, 46, 50]}
------------------------------
{'age': [21.0, 33.5, 42.0, 48.0]}
------------------------------
{'age': [28.666666666666668, 41.5]}
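Once fitted, the Combiner can be applied to data. Assuming toad's transform(X, labels=...) API, a minimal sketch:

# Apply the last fitted binning; labels=True returns readable interval labels
print(c.transform(df[['age']], labels=True))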
Nonlinear transformations can reshape a skewed distribution: square root, log, Box-Cox, or any custom function via FunctionTransformer. Note that log and Box-Cox require strictly positive values:

import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import FunctionTransformer

df = pd.DataFrame({'sales': [3, 7, 8, 2, 6, 3, 6]})

df.insert(len(df.columns), 'sqrt', np.sqrt(df['sales']))
df.insert(len(df.columns), 'log', np.log(df['sales']))
df.insert(len(df.columns), 'boxcox', boxcox(df['sales'])[0])

# Wrap an arbitrary function as a transformer
def my_func(x):
    return x / 2

transformer = FunctionTransformer(my_func)
df.insert(len(df.columns), 'myfunc', transformer.transform(df['sales']))
print(df)
sales sqrt log boxcox myfunc
0 3 1.732051 1.098612 1.639046 1.5
1 7 2.645751 1.945910 4.078599 3.5
2 8 2.828427 2.079442 4.609387 4.0
3 2 1.414214 0.693147 0.887320 1.0
4 6 2.449490 1.791759 3.523320 3.0
5 3 1.732051 1.098612 1.639046 1.5
6 6 2.449490 1.791759 3.523320 3.0
Standardization
Min-max scaling rescales each column to [0, 1]:

import pandas as pd
import numpy as np

df = pd.DataFrame({'sales': [3, 7, 8], 'rand': [-1, 3, 5]})
df[['sales', 'rand']].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
   sales      rand
0    0.0  0.000000
1    0.8  0.666667
2    1.0  1.000000
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
min_max_scaler.fit_transform(df)
Z-score standardization centers each column at 0 with unit variance, by hand, with scale, or with StandardScaler:

df.apply(lambda x: (x - x.mean()) / x.std(ddof=0))
     sales      rand
0 -1.38873 -1.336306
1  0.46291  0.267261
2  0.92582  1.069045
from sklearn.preprocessing import scale

scale(df)
array([[-1.38873015, -1.33630621],
[ 0.46291005, 0.26726124],
[ 0.9258201 , 1.06904497]])
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
std_scaler.fit_transform(df)
array([[-1.38873015, -1.33630621],
[ 0.46291005, 0.26726124],
[ 0.9258201 , 1.06904497]])
Normalizer is different: it rescales each row (sample) to unit norm, not each column:

from sklearn.preprocessing import Normalizer

norm = Normalizer()
norm.fit_transform(df)
array([[ 0.9486833 , -0.31622777],
[ 0.91914503, 0.3939193 ],
[ 0.8479983 , 0.52999894]])
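A quick sanity check (a minimal sketch) confirms every row now has L2 norm 1:

import numpy as np

# Each row vector should have unit length after Normalizer
print(np.linalg.norm(norm.fit_transform(df), axis=1))  # -> [1. 1. 1.]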
Special Data Handling
Missing values: dropna can drop rows with any NaN, only all-NaN rows, or rows with too few non-NaN values:

df = pd.DataFrame([[1, np.nan, 3], [np.nan, np.nan, np.nan], [np.nan, np.nan, 0]],
                  columns=['c1', 'c2', 'c3'])
print(df)
print('-' * 30)
print(df.dropna())           # drop rows containing any NaN
print('-' * 30)
print(df.dropna(how='all'))  # drop rows that are entirely NaN
print('-' * 30)
print(df.dropna(thresh=2))   # keep rows with at least 2 non-NaN values
c1 c2 c3
0 1.0 NaN 3.0
1 NaN NaN NaN
2 NaN NaN 0.0
------------------------------
Empty DataFrame
Columns: [c1, c2, c3]
Index: []
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
2 NaN NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
fillna can fill with a constant, a column statistic, or by propagating the previous valid value:

import random

print(df.fillna(int(random.random() * 10)))  # fill with a random integer 0-9
print('-' * 30)
print(df.fillna(0))                          # fill with a constant
print('-' * 30)
print(df.fillna(df.mean()))                  # fill with each column's mean
print('-' * 30)
print(df.fillna(df.median()))                # fill with each column's median
print('-' * 30)
print(df.fillna(method='ffill'))             # forward fill (newer pandas prefers df.ffill())
c1 c2 c3
0 1.0 8.0 3.0
1 8.0 8.0 8.0
2 8.0 8.0 0.0
------------------------------
c1 c2 c3
0 1.0 0.0 3.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 1.5
2 1.0 NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 1.5
2 1.0 NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 3.0
2 1.0 NaN 0.0
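scikit-learn's SimpleImputer does the same inside a modeling pipeline; a minimal sketch using the mean strategy:

from sklearn.impute import SimpleImputer

# Note: a column that is entirely NaN (like c2 here) is discarded by SimpleImputer
imp = SimpleImputer(strategy='mean')
print(imp.fit_transform(df))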
Duplicate rows are removed with drop_duplicates:

df = pd.DataFrame([[1, 2, 3], [1, 2, 3], [4, 5, 6]], columns=['c1', 'c2', 'c3'])
print(df)
print('-' * 30)
print(df.drop_duplicates())
c1 c2 c3
0 1 2 3
1 1 2 3
2 4 5 6
------------------------------
c1 c2 c3
0 1 2 3
2 4 5 6
Outliers: inspect visually with a boxplot, then flag numerically, here with values whose z-score exceeds 2:

df = pd.DataFrame({'c1': [3, 10, 5, 7, 1, 9, 93],
                   'c2': [15, 16, 14, 78, 19, 11, 8],
                   'c3': [20, 15, 18, 21, 101, 27, 29]},
                  columns=['c1', 'c2', 'c3'])
print(df)

# Visual check
import matplotlib.pyplot as plt
print('-' * 30)
df.boxplot()
plt.show()

# Numerical check: keep only values more than 2 standard deviations above the mean
z = lambda x: (x - x.mean()) / x.std(ddof=0)
print('-' * 30)
print(df[df.apply(z) > 2].dropna(how='all'))
c1 c2 c3
0 3 15 20
1 10 16 15
2 5 14 18
3 7 78 21
4 1 19 101
5 9 11 27
6 93 8 29
------------------------------
[boxplot of c1, c2, c3 omitted]
------------------------------
c1 c2 c3
3 NaN 78.0 NaN
4 NaN NaN 101.0
6 93.0 NaN NaN
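Another common rule is the IQR fence: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is treated as an outlier. A minimal sketch:

# IQR-based outlier mask, computed per column
q1, q3 = df.quantile(0.25), df.quantile(0.75)
iqr = q3 - q1
mask = (df < q1 - 1.5 * iqr) | (df > q3 + 1.5 * iqr)
print(df[mask].dropna(how='all'))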
Multi-Feature Operations
Multicollinearity
We'll use the iris dataset for the multi-feature examples:

from sklearn.datasets import load_iris

iris = load_iris()
X = pd.DataFrame(iris.data, columns=['sl', 'sw', 'pl', 'pw'])
y = pd.DataFrame(iris.target, columns=['y'])
df = pd.concat([X, y], axis=1)
print(df.head())
sl sw pl pw y
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
Pairwise correlations reveal strongly related features:

df.corr()
          sl        sw        pl        pw         y
sl  1.000000 -0.117570  0.871754  0.817941  0.782561
sw -0.117570  1.000000 -0.428440 -0.366126 -0.426658
pl  0.871754 -0.428440  1.000000  0.962865  0.949035
pw  0.817941 -0.366126  0.962865  1.000000  0.956547
y   0.782561 -0.426658  0.949035  0.956547  1.000000
The variance inflation factor (VIF) quantifies multicollinearity; as a common rule of thumb, values above 10 are considered problematic. Dropping correlated columns shrinks the remaining VIFs:

from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = [variance_inflation_factor(X.values, X.columns.get_loc(i)) for i in X.columns]
print(vif)

# Drop sl and recompute
X1 = df.drop(columns=['sl', 'y'])
vif = [variance_inflation_factor(X1.values, X1.columns.get_loc(i)) for i in X1.columns]
print('-' * 30)
print(vif)

# Drop sl and pl and recompute
X2 = df.drop(columns=['sl', 'pl', 'y'])
vif = [variance_inflation_factor(X2.values, X2.columns.get_loc(i)) for i in X2.columns]
print('-' * 30)
print(vif)
[262.9693482414677, 96.35329172369063, 172.96096155387588, 55.50205979323753]
------------------------------
[5.856964572603174, 62.071308334041554, 43.2925737234071]
------------------------------
[2.891774016941542, 2.8917740169415427]
Dimensionality Reduction
PCA projects the features onto the directions of maximum variance (unsupervised):

from sklearn.decomposition import PCA

PCA(n_components=2).fit_transform(iris.data)[0:3]
array([[-2.68412563, 0.31939725],
[-2.71414169, -0.17700123],
[-2.88899057, -0.14494943]])
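To see how much variance the retained components capture, a minimal sketch:

pca = PCA(n_components=2).fit(iris.data)
# Fraction of total variance explained by each component
print(pca.explained_variance_ratio_)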
LDA is supervised: it looks for directions that best separate the classes:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

LDA(n_components=2).fit_transform(iris.data, iris.target)[0:3]
array([[ 8.06179978, 0.30042062],
[ 7.12868772, -0.78666043],
[ 7.48982797, -0.26538449]])
Constructing New Features
PolynomialFeatures generates interaction and power terms; with the default degree=2, the 4 iris features expand to 15 columns (1 bias + 4 linear + 10 quadratic):

from sklearn.preprocessing import PolynomialFeatures

pf = PolynomialFeatures().fit_transform(iris.data)
print(pf[0:3])
print('-' * 30)
print(pf.shape)
[[ 1. 5.1 3.5 1.4 0.2 26.01 17.85 7.14 1.02 12.25 4.9 0.7
1.96 0.28 0.04]
[ 1. 4.9 3. 1.4 0.2 24.01 14.7 6.86 0.98 9. 4.2 0.6
1.96 0.28 0.04]
[ 1. 4.7 3.2 1.3 0.2 22.09 15.04 6.11 0.94 10.24 4.16 0.64
1.69 0.26 0.04]]
------------------------------
(150, 15)
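If you only want cross terms and no squares, interaction_only=True restricts the expansion; a minimal sketch:

# 1 bias + 4 linear + 6 pairwise interaction terms = 11 columns
pf2 = PolynomialFeatures(degree=2, interaction_only=True).fit_transform(iris.data)
print(pf2.shape)  # (150, 11)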
Feature Selection
toad.selection.select filters features by missing rate, information value (IV), and pairwise correlation in one call; return_drop=True also reports which features were dropped and why:

import toad

toad.selection.select(df, df.y,
                      empty=0.7,   # drop features that are >70% missing
                      iv=0.1,      # drop features with IV < 0.1
                      corr=0.95,   # of pairs with corr > 0.95, drop the lower-IV one
                      return_drop=True)
( sl sw pl y
0 5.1 3.5 1.4 0
1 4.9 3.0 1.4 0
2 4.7 3.2 1.3 0
3 4.6 3.1 1.5 0
4 5.0 3.6 1.4 0
.. ... ... ... ..
145 6.7 3.0 5.2 2
146 6.3 2.5 5.0 2
147 6.5 3.0 5.2 2
148 6.2 3.4 5.4 2
149 5.9 3.0 5.1 2
[150 rows x 4 columns],
{'empty': array([], dtype=float64),
'iv': array([], dtype=object),
'corr': array(['pw'], dtype=object)})
The population stability index (PSI) measures distribution drift between two samples, e.g. train vs. test; as a common rule of thumb, PSI below 0.1 means stable, and above 0.25 signals significant shift:

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3, random_state=0)
np.seterr(divide='ignore', invalid='ignore')
toad.metrics.PSI(train, test).sort_values(0)
y 0.081994
pl 0.316619
pw 0.418114
sw 0.425005
sl 0.762664
dtype: float64
Stepwise selection adds and removes features to optimize a criterion such as AIC:

toad.selection.stepwise(df,
                        df.y,
                        direction='both',
                        criterion='aic',
                        estimator='ols',
                        intercept=False).head()
    sl  y
0  5.1  0
1  4.9  0
2  4.7  0
3  4.6  0
4  5.0  0
Filter methods: VarianceThreshold keeps only features whose variance exceeds the threshold:

from sklearn.feature_selection import VarianceThreshold

VarianceThreshold(threshold=3).fit_transform(iris.data)[0:3]
array([[1.4],
[1.4],
[1.3]])
SelectKBest with a custom scoring function, here the Pearson correlation of each feature with the target:

from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

# Score each column by its Pearson correlation with y
r = lambda X, Y: np.array(list(map(lambda x: pearsonr(x, Y)[0], X.T))).T
SelectKBest(r, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
SelectKBest with the chi-square test:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
SelectKBest with mutual information:

from sklearn.feature_selection import SelectKBest
from sklearn import metrics

mic = metrics.mutual_info_score
# Score each column by its mutual information with y
g = lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T
SelectKBest(g, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
Wrapper methods: recursive feature elimination (RFE) repeatedly fits a model and prunes the weakest features:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

RFE(estimator=LogisticRegression(solver='liblinear'),
    n_features_to_select=2).fit_transform(iris.data, iris.target)[0:3]
array([[3.5, 0.2],
[3. , 0.2],
[3.2, 0.2]])
Embedded methods: SelectFromModel with an L1-penalized logistic regression keeps features with nonzero coefficients:

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

SelectFromModel(LogisticRegression(penalty='l1', C=0.1,
                                   solver='liblinear')).fit_transform(iris.data, iris.target)[0:3]
array([[5.1, 3.5, 1.4],
[4.9, 3. , 1.4],
[4.7, 3.2, 1.3]])
SelectFromModel also works with tree-based feature importances, e.g. gradient boosting:

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
Class Imbalance
Oversampling
SMOTE oversamples minority classes by interpolating between neighboring minority samples (iris is already balanced, so this is just API illustration):

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_smotesampled, y_smotesampled = smote.fit_resample(iris.data, iris.target)
Undersampling
Random undersampling discards majority-class samples until the classes are balanced:

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
X_undersampled, y_undersampled = rus.fit_resample(iris.data, iris.target)
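Either way, it's worth verifying the class counts after resampling; a minimal sketch:

from collections import Counter

# iris is balanced (50 per class), so counts are unchanged here;
# on an imbalanced dataset you'd see minority classes grow (SMOTE)
# or majority classes shrink (undersampling)
print(Counter(y_smotesampled))
print(Counter(y_undersampled))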