特征工程(上)

什么是特征?

在机器学习的背景下,特征是用来解释现象发生的单个特性或一组特性。 当这些特性转换为某种可度量的形式时,它们被称为特征。

举个例子,假设你有一个学生列表,这个列表里包含每个学生的姓名、学习小时数、IQ和之前考试的总分数。现在,有一个新学生,你知道他/她的学习小时数和IQ,但他/她的考试分数缺失,你需要估算他/她可能获得的考试分数。

在这里,你需要用IQ和study_hours构建一个估算分数缺失值的预测模型。所以,IQ和study_hours就成了这个模型的特征。

特征工程可能包含的内容

  • 基础特征构造
  • 数据预处理
  • 特征衍生
  • 特征变换
  • 特征筛选

这是一个完整的特征工程流程,但不是唯一的流程,每个过程都有可能会交换顺序,随着学习的加深,大家会慢慢体会到。

数据预处理

  • 数据去量纲(标准化、归一化处理)
  • 缺失值处理(缺失比例高的特征可以直接删除,其他特征根据业务含义来决定使用什么值来填补)
  • 离散型变量的编码
  • 连续型变量的离散化
  • 分位数,极值处理,我们最粗暴的方法就是将前后1%的值抹去
  • 分箱处理(等距切分、等频切分)

基于时间序列进行特征衍生

# Number of months with inv > 0 over the most recent p months.
def Num(inv, p):
    """Count, per row, the months whose value is strictly positive.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple with one count per row.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col]
    positive_months = (window.to_numpy() > 0).astype(int)
    return f'{inv}_num{p}', positive_months.sum(axis=1)

# Number of months with inv == 0 over the most recent p months.
def Nmz(inv, p):
    """Count, per row, the months whose value equals zero.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple with one count per row.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col]
    zero_months = (window.to_numpy() == 0).astype(int)
    return f'{inv}_nmz{p}', zero_months.sum(axis=1)


# Flag: does at least one of the most recent p months have inv > 0?
def Evr(inv, p):
    """Return 1 per row when any month in the window is positive, else 0.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is an array of 0/1.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col]
    ever_positive = (window.to_numpy() > 0).any(axis=1)
    return f'{inv}_evr{p}', ever_positive.astype(int)

# Mean of inv over the most recent p months.
def Avg(inv, p):
    """Compute the NaN-ignoring row mean of the p-month window.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple with one mean per row.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col].to_numpy()
    row_mean = np.nanmean(window, axis=1)
    return f'{inv}_avg{p}', row_mean


# Sum of inv over the most recent p months.
def Tot(inv, p):
    """Compute the NaN-ignoring row sum of the p-month window.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple with one total per row.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col].to_numpy()
    row_total = np.nansum(window, axis=1)
    return f'{inv}_tot{p}', row_total


# Sum of inv over months 2 through p+1 (inclusive).
def Tot2T(inv, p):
    """Sum each row over the window shifted back by one month.

    Selects the columns ``inv + '2'`` through ``inv + str(p + 1)`` (label
    slice) from the global ``data`` frame and sums them with pandas' default
    NaN skipping.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    start_col, stop_col = inv + '2', inv + str(p + 1)
    shifted_window = data.loc[:, start_col:stop_col]
    row_total = shifted_window.sum(axis=1)
    return f'{inv}_tot2t{p}', row_total


# Maximum of inv over the most recent p months.
def Max(inv, p):
    """Compute the NaN-ignoring row maximum of the p-month window.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple with one maximum per row.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col].to_numpy()
    row_max = np.nanmax(window, axis=1)
    return f'{inv}_max{p}', row_max


# Minimum of inv over the most recent p months.
def Min(inv, p):
    """Compute the NaN-ignoring row minimum of the p-month window.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple with one minimum per row.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col].to_numpy()
    row_min = np.nanmin(window, axis=1)
    return f'{inv}_min{p}', row_min

# Months since the most recent month with inv > 0, within the last p months.
def Msg(inv, p):
    """Locate, per row, the first month in the window whose value is positive.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame and reports the 1-based position of the
    first positive value, scanning from ``inv + '1'``.

    Rows without any positive value get the integer ``0``. The previous
    implementation appended the string ``'0'`` for those rows, which mixed
    ``str`` and ``int`` in the same feature.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of ints.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    positive = window.to_numpy() > 0
    # argmax yields the first True per row, but also 0 for all-False rows,
    # so rows without a hit are masked to 0 explicitly.
    first_hit = positive.argmax(axis=1) + 1
    auto_value = np.where(positive.any(axis=1), first_hit, 0).tolist()
    return inv + '_msg' + str(p), auto_value
 

# Months since the most recent month with inv == 0, within the last p months.
def Msz(inv, p):
    """Locate, per row, the first month in the window whose value is zero.

    Selects the columns ``inv + '1'`` through ``inv + str(p)`` (label slice)
    from the global ``data`` frame and reports the 1-based position of the
    first zero value, scanning from ``inv + '1'``.

    Rows without any zero value get the integer ``0``. The previous
    implementation appended the string ``'0'`` for those rows, which mixed
    ``str`` and ``int`` in the same feature.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of ints.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    is_zero = window.to_numpy() == 0
    # argmax yields the first True per row, but also 0 for all-False rows,
    # so rows without a hit are masked to 0 explicitly.
    first_hit = is_zero.argmax(axis=1) + 1
    auto_value = np.where(is_zero.any(axis=1), first_hit, 0).tolist()
    return inv + '_msz' + str(p), auto_value
    
# Current-month inv divided by the mean of inv over the most recent p months.
def Cav(inv, p):
    """Ratio of the ``inv + '1'`` column to the NaN-ignoring window mean.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    current = window[inv + '1']
    window_mean = np.nanmean(window.to_numpy(), axis=1)
    return f'{inv}_cav{p}', current.div(window_mean)

# Current-month inv divided by the minimum of inv over the most recent p months.
def Cmn(inv, p):
    """Ratio of the ``inv + '1'`` column to the NaN-ignoring window minimum.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    current = window[inv + '1']
    window_min = np.nanmin(window.to_numpy(), axis=1)
    return f'{inv}_cmn{p}', current.div(window_min)

# Largest increase of inv between adjacent months within the last p months.
def Mai(inv, p):
    """Per row, take the maximum of ``value[k] - value[k+1]`` over the window.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; NaN differences are ignored by ``np.nanmax``.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list with one
        entry per row.
    """
    window = np.array(data.loc[:, inv + '1':inv + str(p)])
    # row[:-1] - row[1:] is each month minus the following column's month.
    auto_value = [np.nanmax(row[:-1] - row[1:]) for row in window]
    return f'{inv}_mai{p}', auto_value

# Largest decrease of inv between adjacent months within the last p months.
def Mad(inv, p):
    """Per row, take the maximum of ``value[k+1] - value[k]`` over the window.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; NaN differences are ignored by ``np.nanmax``.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list with one
        entry per row.
    """
    window = np.array(data.loc[:, inv + '1':inv + str(p)])
    # row[1:] - row[:-1] is each following column's month minus the month.
    auto_value = [np.nanmax(row[1:] - row[:-1]) for row in window]
    return f'{inv}_mad{p}', auto_value

# Standard deviation of inv over the most recent p months.
def Std(inv, p):
    """Compute the NaN-ignoring population standard deviation per row.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame, which is defined outside this function.

    The previous implementation returned ``np.nanvar`` (the variance) under
    the ``_std`` feature name; this now returns ``np.nanstd`` as documented.

    Returns:
        A ``(feature_name, values)`` tuple with one value per row.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanstd(window, axis=1)
    return inv + '_std' + str(p), auto_value

    
# Coefficient of variation of inv over the most recent p months.
def Cva(inv, p):
    """Compute the coefficient of variation (std / mean) per row.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; NaN values are ignored.

    The previous implementation returned mean / variance, which is not the
    coefficient of variation. A zero mean yields ``inf`` or ``nan`` through
    NumPy's division rules rather than raising.

    Returns:
        A ``(feature_name, values)`` tuple with one value per row.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanstd(window, axis=1) / np.nanmean(window, axis=1)
    return inv + '_cva' + str(p), auto_value



# Current-month inv minus the mean of inv over the most recent p months.
def Cmm(inv, p):
    """Difference between the ``inv + '1'`` column and the window mean.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; the mean ignores NaN.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    current = window[inv + '1']
    window_mean = np.nanmean(window.to_numpy(), axis=1)
    return f'{inv}_cmm{p}', current.sub(window_mean)

# Current-month inv minus the minimum of inv over the most recent p months.
def Cnm(inv, p):
    """Difference between the ``inv + '1'`` column and the window minimum.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; the minimum ignores NaN.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    current = window[inv + '1']
    window_min = np.nanmin(window.to_numpy(), axis=1)
    return f'{inv}_cnm{p}', current.sub(window_min)


# Current-month inv minus the maximum of inv over the most recent p months.
def Cxm(inv, p):
    """Difference between the ``inv + '1'`` column and the window maximum.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; the maximum ignores NaN.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    current = window[inv + '1']
    window_max = np.nanmax(window.to_numpy(), axis=1)
    return f'{inv}_cxm{p}', current.sub(window_max)



# (current-month inv - max of inv over the last p months) / that max.
def Cxp(inv, p):
    """Relative gap between the ``inv + '1'`` column and the window maximum.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; the maximum ignores NaN.

    The previous implementation used ``np.nanmin`` although the feature is
    specified against the maximum, which made it a duplicate of ``Cnp``.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    df = data.loc[:, inv + '1':inv + str(p)]
    window_max = np.nanmax(df, axis=1)
    auto_value = (df[inv + '1'] - window_max) / window_max
    return inv + '_cxp' + str(p), auto_value

# Range (max - min) of inv over the most recent p months.
def Ran(inv, p):
    """Compute the NaN-ignoring spread between row maximum and row minimum.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame, which is defined outside this function.

    Returns:
        A ``(feature_name, values)`` tuple with one value per row.
    """
    window = data.loc[:, inv + '1':inv + str(p)].to_numpy()
    highest = np.nanmax(window, axis=1)
    lowest = np.nanmin(window, axis=1)
    return f'{inv}_ran{p}', highest - lowest


# Number of month-over-month increases of inv within the most recent p months.
def Nci(inv, p):
    """Count, per row, the adjacent pairs where ``value[k] - value[k+1] > 0``.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame. The original header mentions
    min(time on book, p); the code only uses ``p``.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of counts.
    """
    window = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = [((row[:-1] - row[1:]) > 0).sum() for row in window]
    return f'{inv}_nci{p}', auto_value
   

# Number of month-over-month decreases of inv within the most recent p months.
def Ncd(inv, p):
    """Count, per row, the adjacent pairs where ``value[k] - value[k+1] < 0``.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame. The original header mentions
    min(time on book, p); the code only uses ``p``.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of counts.
    """
    window = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = [((row[:-1] - row[1:]) < 0).sum() for row in window]
    return f'{inv}_ncd{p}', auto_value
           

# Number of adjacent month pairs with equal inv within the most recent p months.
def Ncn(inv, p):
    """Count, per row, the adjacent pairs where ``value[k] - value[k+1] == 0``.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame. The original header mentions
    min(time on book, p); the code only uses ``p``.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of counts.
    """
    window = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = [((row[:-1] - row[1:]) == 0).sum() for row in window]
    return f'{inv}_ncn{p}', auto_value
 
# Flag = 1 when inv[i] > inv[i+1] holds for every adjacent pair in the most
# recent p months (strictly increasing over time), else 0.
def Bup(inv, p):
    """Flag rows whose window is strictly increasing toward month 1.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame. A row is flagged when every column is strictly
    greater than the column after it. Comparisons involving NaN are False,
    and a window with fewer than two columns is never flagged.

    The previous implementation always returned 0: ``index =+ 1`` assigned
    ``+1`` instead of incrementing, and the counter was compared against
    ``p`` although only ``p - 1`` pairs exist. Its break condition was also
    the inverse of the documented ``inv[i] > inv[i+1]`` rule.

    NOTE(review): the original spec also mentions "inv > 0"; the code never
    enforced it and it is still not enforced here. Confirm whether it is
    required.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of 0/1 ints.
    """
    window = np.array(data.loc[:, inv + '1':inv + str(p)])
    if window.shape[1] < 2:
        # No adjacent pair to compare, so no trend can be established.
        flags = np.zeros(len(window), dtype=int)
    else:
        flags = (window[:, :-1] > window[:, 1:]).all(axis=1).astype(int)
    return inv + '_bup' + str(p), flags.tolist()

# Flag = 1 when inv[i] < inv[i+1] holds for every adjacent pair in the most
# recent p months (strictly decreasing over time), else 0.
def Pdn(inv, p):
    """Flag rows whose window is strictly decreasing toward month 1.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame. A row is flagged when every column is strictly
    smaller than the column after it. Comparisons involving NaN are False,
    and a window with fewer than two columns is never flagged.

    The previous implementation always returned 0: ``index =+ 1`` assigned
    ``+1`` instead of incrementing, and the counter was compared against
    ``p`` although only ``p - 1`` pairs exist. Its break condition was also
    the inverse of the documented ``inv[i] < inv[i+1]`` rule.

    NOTE(review): the original spec also mentions "inv > 0"; the code never
    enforced it and it is still not enforced here. Confirm whether it is
    required.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of 0/1 ints.
    """
    window = np.array(data.loc[:, inv + '1':inv + str(p)])
    if window.shape[1] < 2:
        # No adjacent pair to compare, so no trend can be established.
        flags = np.zeros(len(window), dtype=int)
    else:
        flags = (window[:, :-1] < window[:, 1:]).all(axis=1).astype(int)
    return inv + '_pdn' + str(p), flags.tolist()



# Trimmed mean of inv over the most recent p months.
def Trm(inv, p):
    """Per row, drop one maximum and one minimum, then average the rest.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame. The original header mentions
    min(time on book, p); the code only uses ``p``.

    Rows are now walked positionally. The previous implementation used
    ``df.loc[i, :]`` with ``i`` from ``range(len(df))``, which raises
    ``KeyError`` or reads the wrong rows when the index is not 0..n-1.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list with one
        trimmed mean per row.
    """
    window = data.loc[:, inv + '1':inv + str(p)].to_numpy()
    auto_value = []
    for row in window:
        trimmed = list(row)
        # Remove a single occurrence of the extremes, as the original did.
        trimmed.remove(np.nanmax(trimmed))
        trimmed.remove(np.nanmin(trimmed))
        auto_value.append(np.nanmean(trimmed))
    return inv + '_trm' + str(p), auto_value

# Current-month inv divided by the maximum of inv over the most recent p months.
def Cmx(inv, p):
    """Ratio of the ``inv + '1'`` column to the NaN-ignoring window maximum.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame.

    The previous implementation computed (current - max) / max, which is the
    formula specified for ``Cxp`` rather than the plain ratio documented
    here. This matches the sibling ratios ``Cav`` (mean) and ``Cmn`` (min).

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = df[inv + '1'] / np.nanmax(df, axis=1)
    return inv + '_cmx' + str(p), auto_value

# (current-month inv - mean of inv over the last p months) / that mean.
def Cmp(inv, p):
    """Relative gap between the ``inv + '1'`` column and the window mean.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; the mean ignores NaN.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    current = window[inv + '1']
    gap = current - np.nanmean(window.to_numpy(), axis=1)
    return f'{inv}_cmp{p}', gap / np.nanmean(window.to_numpy(), axis=1)


# (current-month inv - min of inv over the last p months) / that min.
def Cnp(inv, p):
    """Relative gap between the ``inv + '1'`` column and the window minimum.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; the minimum ignores NaN.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a pandas Series.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    current = window[inv + '1']
    gap = current - np.nanmin(window.to_numpy(), axis=1)
    return f'{inv}_cnp{p}', gap / np.nanmin(window.to_numpy(), axis=1)


# Months since the month holding the maximum inv, within the last p months.
def Msx(inv, p):
    """Locate, per row, the first month in the window that equals the row max.

    The window is the label slice ``inv + '1'`` through ``inv + str(p)`` of
    the global ``data`` frame; the maximum ignores NaN. The result is the
    1-based position of the first column equal to that maximum. If no column
    matches (for example an all-NaN row), the result is the number of window
    columns plus one, as in the original loop.

    The previous implementation added and overwrote columns on the ``.loc``
    slice itself, which triggers pandas' SettingWithCopyWarning and depends
    on view-versus-copy behavior. The work is now done on a NumPy array, so
    ``data`` and its slice are never written to.

    Returns:
        A ``(feature_name, values)`` tuple; ``values`` is a list of ints.
    """
    window = data.loc[:, inv + '1':inv + str(p)].to_numpy()
    row_max = np.nanmax(window, axis=1)
    is_max = window == row_max[:, None]
    # argmax gives the first True per row; rows with no match fall back to
    # "one past the last column".
    first_hit = is_max.argmax(axis=1) + 1
    no_match = window.shape[1] + 1
    auto_value = np.where(is_max.any(axis=1), first_hit, no_match).tolist()
    return inv + '_msx' + str(p), auto_value


# Mean of inv over months 1..p divided by the mean over months p..2p.
def Rpp(inv, p):
    """Ratio of the recent-window mean to the earlier-window mean, per row.

    Both windows are label slices of the global ``data`` frame: the recent
    one runs ``inv + '1'`` through ``inv + str(p)``, the earlier one
    ``inv + str(p)`` through ``inv + str(2 * p)``. Both slices include month
    ``p``, so the windows overlap by one column. Means ignore NaN.

    Returns:
        A ``(feature_name, values)`` tuple with one ratio per row.
    """
    recent = data.loc[:, inv + '1':inv + str(p)].to_numpy()
    recent_mean = np.nanmean(recent, axis=1)
    earlier = data.loc[:, inv + str(p):inv + str(2 * p)].to_numpy()
    earlier_mean = np.nanmean(earlier, axis=1)
    return f'{inv}_rpp{p}', recent_mean / earlier_mean


# Mean of inv over months 1..p minus the mean over months p..2p.
def Dpp(inv, p):
    """Difference between recent-window and earlier-window means, per row.

    Both windows are label slices of the global ``data`` frame: the recent
    one runs ``inv + '1'`` through ``inv + str(p)``, the earlier one
    ``inv + str(p)`` through ``inv + str(2 * p)``. Both slices include month
    ``p``, so the windows overlap by one column. Means ignore NaN.

    Returns:
        A ``(feature_name, values)`` tuple with one difference per row.
    """
    recent = data.loc[:, inv + '1':inv + str(p)].to_numpy()
    recent_mean = np.nanmean(recent, axis=1)
    earlier = data.loc[:, inv + str(p):inv + str(2 * p)].to_numpy()
    earlier_mean = np.nanmean(earlier, axis=1)
    return f'{inv}_dpp{p}', recent_mean - earlier_mean


# Max of inv over months 1..p divided by the max over months p..2p.
def Mpp(inv, p):
    """Ratio of the recent-window maximum to the earlier-window maximum.

    Both windows are label slices of the global ``data`` frame: the recent
    one runs ``inv + '1'`` through ``inv + str(p)``, the earlier one
    ``inv + str(p)`` through ``inv + str(2 * p)``. Both slices include month
    ``p``, so the windows overlap by one column. Maxima ignore NaN.

    Returns:
        A ``(feature_name, values)`` tuple with one ratio per row.
    """
    recent = data.loc[:, inv + '1':inv + str(p)].to_numpy()
    recent_max = np.nanmax(recent, axis=1)
    earlier = data.loc[:, inv + str(p):inv + str(2 * p)].to_numpy()
    earlier_max = np.nanmax(earlier, axis=1)
    return f'{inv}_mpp{p}', recent_max / earlier_max

  
# Min of inv over months 1..p divided by the min over months p..2p.
def Npp(inv, p):
    """Ratio of the recent-window minimum to the earlier-window minimum.

    Both windows are label slices of the global ``data`` frame: the recent
    one runs ``inv + '1'`` through ``inv + str(p)``, the earlier one
    ``inv + str(p)`` through ``inv + str(2 * p)``. Both slices include month
    ``p``, so the windows overlap by one column. Minima ignore NaN.

    Returns:
        A ``(feature_name, values)`` tuple with one ratio per row.
    """
    recent = data.loc[:, inv + '1':inv + str(p)].to_numpy()
    recent_min = np.nanmin(recent, axis=1)
    earlier = data.loc[:, inv + str(p):inv + str(2 * p)].to_numpy()
    earlier_min = np.nanmin(earlier, axis=1)
    return f'{inv}_npp{p}', recent_min / earlier_min

然后我们用一个小demo来实验一下
原始数据特征标签:
在这里插入图片描述
经过特征衍生的特征标签:
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值