特征工程:
1.特征使用(数据的选择、数据的可用性)
2.特征获取(特征来源、特征存储)
3.特征处理(数据清洗、特征预处理)
4.特征监控(现有特征、新特征)
下面重点介绍一下特征处理
1.数据清洗
1)数据样本采集(抽样)
2)异常值(空值)处理
# --- 1) Data cleaning demo: missing values, duplicates, outliers ---
import pandas as pd

df = pd.DataFrame({'A': ['a0', 'a1', 'a1', 'a2', 'a3', 'a4'],
                   'B': ['b0', 'b1', 'b2', 'b2', 'b3', None],
                   'C': [1, 2, None, 3, 4, 5],
                   'D': [0.1, 10.2, 11.4, 8.9, 9.1, 12],
                   'E': [10, 19, 32, 25, 8, None],
                   'F': ['f0', 'f1', 'g2', 'f3', 'f4', 'f5']})

df.isnull()                       # True where a cell is missing, else False
df.dropna()                       # drop every row that contains a missing value
df.dropna(subset=['B'])           # drop only rows where column B is missing
df.duplicated(['A'])              # True for repeated values in column A
df.duplicated(['A', 'B'])         # True only when A AND B are both repeated
df.drop_duplicates(['A'])         # keep the first occurrence of each A value
df.drop_duplicates(['A'], keep=False)  # keep='first'/'last'/False; False drops ALL duplicated rows
df.fillna('b*')                   # fill every missing cell with 'b*'
df.fillna(df['E'].mean())         # fill missing cells with the column-E mean
df['E'].interpolate()             # interpolate missing values (method= selectable)

# Tukey-fence outlier removal on column D
upper_q = df['D'].quantile(0.75)
lower_q = df['D'].quantile(0.25)
q_inter = upper_q - lower_q       # inter-quartile range (IQR)
k = 1.5
# Combine both bounds into ONE boolean mask.  The original chained form
# df[cond1][cond2] re-indexes the already-filtered frame with a full-length
# mask, which is deprecated and fragile.
df_no_outlier = df[(df['D'] > lower_q - k * q_inter) & (df['D'] < upper_q + k * q_inter)]

# Keep only rows whose F value starts with 'f' (vectorized, no manual loop)
df_f_rows = df[df['F'].str.startswith('f')]
2.特征预处理
1)特征选择
剔除与标注不相关或者冗余的特征
特征选择是数据归约的一种思路(另一种思路为抽样)
三种思路:1)过滤思想(阈值设置比较灵活)
2) 包裹思想(遍历特征子集,常用方法:RFE算法)
3)嵌入思想(利用带正则化或特征重要性的模型进行选择,如 L1 回归、决策树)
# --- Feature selection demo: filter / wrapper / embedded approaches ---
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel

# Three random normal features A-C plus a binary label D.
# NOTE: randint's `high` bound is exclusive, so D only takes values {0, 1}.
df = pd.DataFrame({'A': ss.norm.rvs(size=10),
                   'B': ss.norm.rvs(size=10),
                   'C': ss.norm.rvs(size=10),
                   'D': np.random.randint(low=0, high=2, size=10)})
X = df.loc[:, ['A', 'B', 'C']]
Y = df.loc[:, 'D']

# Filter: keep the k best features by a univariate score
# (score_func can be supplied; the default is f_classif)
filter_sel = SelectKBest(k=2)
filter_sel.fit(X, Y)
filter_sel.transform(X)

# Wrapper: recursive feature elimination — keep n_features_to_select,
# dropping `step` features on each round
wrapper_sel = RFE(estimator=SVR(kernel='linear'), n_features_to_select=2, step=1)
wrapper_sel.fit_transform(X, Y)

# Embedded: fit a model and keep features whose importance exceeds `threshold`
embedded_sel = SelectFromModel(estimator=DecisionTreeRegressor(), threshold=0.1)
embedded_sel.fit_transform(X, Y)
2)特征变换
1.对指化
numpy.log
numpy.exp
2.离散化
# --- Discretization (binning) of a numeric list ---
import numpy as np
import pandas as pd

lst = [6, 8, 10, 15, 16, 24, 25, 40, 67]

# Equal-depth (quantile) binning: each bin holds roughly the same number of points.
# Results are bound to names (instead of being computed and discarded) so the
# bins can be inspected/reused.
equal_depth = pd.qcut(lst, q=3)
equal_depth_labels = pd.qcut(lst, q=3, labels=['low', 'medium', 'high'])

# Equal-width binning: each bin spans the same value range
equal_width = pd.cut(lst, bins=3)
equal_width_labels = pd.cut(lst, bins=3, labels=['low', 'medium', 'high'])
3.归一化
from sklearn.preprocessing import MinMaxScaler,StandardScaler
MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1,1))# scale to [0,1]; reshape(-1,1) makes the 2-D single column sklearn expects
4.标准化
StandardScaler().fit_transform(np.array([1,1,1,0,0,0,0,0]).reshape(-1,1))
5.数值化
# --- Encoding categorical features as numbers ---
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# LabelEncoder maps category strings to integer codes (for ordinal data).
# FIX: LabelEncoder expects a 1-D array — the original .reshape(-1,1) passed a
# 2-D column, which sklearn rejects/warns about, so it is removed here.
# CAVEAT: codes are assigned alphabetically ('Down'=0,'Up'=1; 'High'=0,
# 'Low'=1,'Medium'=2), which may not match the intended ordinal order.
LabelEncoder().fit_transform(np.array(['Down', 'Up', 'Up', 'Down']))
LabelEncoder().fit_transform(np.array(['Low', 'Medium', 'High', 'Medium', 'High']))

# One-Hot encoding: label-encode first, then one-hot the integer codes.
# (Modern sklearn OneHotEncoder can also encode string columns directly.)
lb_encoder = LabelEncoder()
lb_tran_f = lb_encoder.fit_transform(np.array(['Red', 'Green', 'Yellow', 'Blue']))
oht_encoder = OneHotEncoder().fit(lb_tran_f.reshape(-1, 1))
oht_encoder.transform(lb_encoder.transform(np.array(['Yellow', 'Blue', 'Green', 'Green', 'Red'])).reshape(-1, 1)).toarray()

# Normalization works ROW-wise.  On a (n,1) column every row holds a single
# element, so l1-normalizing it just yields the sign — usually a mistake;
# the 1-row forms below show the intended per-row normalization.
from sklearn.preprocessing import Normalizer
Normalizer(norm='l1').fit_transform(np.array([1, 1, 3, -1, 2]).reshape(-1, 1))
Normalizer(norm='l1').fit_transform(np.array([[1, 1, 3, -1, 2]]))  # divide the row by sum(|x|)
Normalizer(norm='l2').fit_transform(np.array([[1, 1, 3, -1, 2]]))  # divide the row by sqrt(sum(x^2))
3)特征降维(LDA)
# LDA: supervised dimensionality reduction
# (n_components is capped at number_of_classes - 1)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X=np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
Y=np.array([1,1,1,2,2,2])
LinearDiscriminantAnalysis(n_components=1).fit_transform(X,Y)  # project onto 1 discriminant axis
# LDA can also be used directly as a classifier (Fisher discriminant)
clf=LinearDiscriminantAnalysis(n_components=1).fit(X,Y)
clf.predict([[0.8,1]])  # predict the class of a new sample
4)特征衍生
3.实例
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
#sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler(均值为0标准差为1)
#le:last_evaluation——False:MinMaxScaler;True:StandardScaler
#npr:number_project——False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
#wa:Work_accident——False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years ——False:MinMaxScaler;True:StandardScaler
#dp:department——False:LabelEncoding,True:OneHotEncoding
#slr:salary——False:LabelEncoding,True:OneHotEncoding
def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False,
                     pl5=False, dp=False, slr=False, lower_d=False, ld_n=1,
                     csv_path='/home/WLY/learn/Learn/HR.csv'):
    """Load and preprocess the HR dataset.

    Each boolean flag selects the transform for one column:
      numeric columns (sl, le, npr, amh, tsc, wa, pl5)
          False: MinMaxScaler, True: StandardScaler (zero mean, unit variance)
      categorical columns (dp=department, slr=salary)
          False: label encoding followed by min-max scaling, True: one-hot
    lower_d / ld_n -- if lower_d, PCA-reduce the features to ld_n components.
    csv_path       -- location of HR.csv; new optional parameter, the old
                      hard-coded path is kept as the default for compatibility.

    Returns (features, label): the feature DataFrame (an ndarray when
    lower_d is True) and the 'left' label Series.
    """
    df = pd.read_csv(csv_path)

    # --- 2. clean ---
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])  # drop missing values
    # Single combined mask.  The original chained df[c1][c2] form re-indexes an
    # already-filtered frame with a full-length boolean Series (deprecated/fragile).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]  # drop anomalies

    # --- 1. label ---
    label = df['left']
    df = df.drop('left', axis=1)  # axis=1: drop the column, not a row

    # --- 3. feature selection: few features in this example, keep them all ---

    # --- 4. scale the numeric columns ---
    numeric_flags = [sl, le, npr, amh, tsc, wa, pl5]
    numeric_cols = ['satisfaction_level', 'last_evaluation', 'number_project',
                    'average_monthly_hours', 'time_spend_company', 'Work_accident',
                    'promotion_last_5years']
    for flag, col in zip(numeric_flags, numeric_cols):
        scaler = StandardScaler() if flag else MinMaxScaler()
        # sklearn wants a 2-D column; flatten back before re-assigning
        df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1)).reshape(1, -1)[0]

    # --- encode the categorical columns ---
    for flag, col in zip([dp, slr], ['department', 'salary']):
        if not flag:
            if col == 'salary':
                # Plain LabelEncoder sorts alphabetically (high=0); map_salary
                # preserves the real order low < medium < high.
                df[col] = [map_salary(s) for s in df['salary'].values]
            else:
                df[col] = LabelEncoder().fit_transform(df[col])
            # normalize the integer codes after label encoding
            df[col] = MinMaxScaler().fit_transform(df[col].values.reshape(-1, 1)).reshape(1, -1)[0]
        else:
            df = pd.get_dummies(df, columns=[col])  # one-hot via pandas

    # --- optional dimensionality reduction ---
    if lower_d:
        # LDA's n_components is capped by the class count, so PCA is used here.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label
# Ordinal mapping for the salary column: low < medium < high.
d = {'low': 0, 'medium': 1, 'high': 2}

def map_salary(s):
    """Map a salary string to its ordinal code; unknown values fall back to 0."""
    return d.get(s, 0)
def main():
    """Run the demo: standardize sl/le, one-hot the department, no reduction."""
    features, label = hr_preprocessing(sl=True, le=True, dp=True, lower_d=False, ld_n=3)
    print((features, label))

if __name__ == '__main__':  # script entry guard (double underscores)
    main()