金融风控实战——基于时间序列进行特征衍生

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

给大家写了35个函数,我们逐个来看一下

# Number of months, among the most recent p, in which inv > 0.
def Num(inv, p):
    """Count per row how many of the columns inv1..inv<p> are positive.

    Reads the module-level ``data`` frame. Returns a
    ``(feature_name, values)`` pair, where ``feature_name`` is
    ``inv + '_num' + str(p)`` and ``values`` is a NumPy array with one
    count per row of ``data``.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col]
    positive_mask = (window > 0).to_numpy()
    return '{}_num{}'.format(inv, p), positive_mask.sum(axis=1)

# Number of months, among the most recent p, in which inv == 0.
def Nmz(inv, p):
    """Count per row how many of the columns inv1..inv<p> equal zero.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_nmz' + str(p), counts)`` where ``counts`` is a NumPy array
    with one entry per row. Missing values do not compare equal to zero,
    so they are not counted.
    """
    first_col, last_col = inv + '1', inv + str(p)
    is_zero = (data.loc[:, first_col:last_col] == 0).to_numpy()
    counts = is_zero.sum(axis=1)
    return '{}_nmz{}'.format(inv, p), counts

# Whether inv > 0 in at least one of the most recent p months.
def Evr(inv, p):
    """Flag rows that have at least one positive value in inv1..inv<p>.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_evr' + str(p), flags)`` where ``flags`` is a NumPy array
    holding 1 for rows with any positive month and 0 otherwise.
    """
    first_col, last_col = inv + '1', inv + str(p)
    window = data.loc[:, first_col:last_col]
    has_positive = (window > 0).to_numpy().any(axis=1)
    flags = np.where(has_positive, 1, 0)
    return '{}_evr{}'.format(inv, p), flags

# Mean of inv over the most recent p months.
def Avg(inv, p):
    """Compute the NaN-ignoring row mean of the columns inv1..inv<p>.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_avg' + str(p), means)`` where ``means`` is a NumPy array
    with one value per row.
    """
    first_col = inv + '1'
    last_col = inv + str(p)
    monthly = data.loc[:, first_col:last_col]
    means = np.nanmean(monthly, axis=1)
    return '{}_avg{}'.format(inv, p), means


# Sum of inv over the most recent p months.
def Tot(inv, p):
    """Compute the NaN-ignoring row sum of the columns inv1..inv<p>.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_tot' + str(p), totals)`` where ``totals`` is a NumPy array
    with one value per row.
    """
    first_col = inv + '1'
    last_col = inv + str(p)
    monthly = data.loc[:, first_col:last_col]
    totals = np.nansum(monthly, axis=1)
    return '{}_tot{}'.format(inv, p), totals

# Sum of inv over months 2 through p+1 (both ends included).
def Tot2T(inv, p):
    """Sum per row the columns inv2..inv<p+1> of the module-level ``data``.

    Returns ``(inv + '_tot2t' + str(p), totals)``. Unlike most sibling
    helpers, ``totals`` is a pandas Series (the result of
    ``DataFrame.sum``), indexed like ``data``.
    """
    start_col = inv + '2'
    stop_col = inv + str(p + 1)
    shifted_window = data.loc[:, start_col:stop_col]
    totals = shifted_window.sum(axis=1)
    return '{}_tot2t{}'.format(inv, p), totals


# Maximum of inv over the most recent p months.
def Max(inv, p):
    """Compute the NaN-ignoring row maximum of the columns inv1..inv<p>.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_max' + str(p), maxima)`` where ``maxima`` is a NumPy array
    with one value per row.
    """
    first_col = inv + '1'
    last_col = inv + str(p)
    monthly = data.loc[:, first_col:last_col]
    maxima = np.nanmax(monthly, axis=1)
    return '{}_max{}'.format(inv, p), maxima


# Minimum of inv over the most recent p months.
def Min(inv, p):
    """Compute the NaN-ignoring row minimum of the columns inv1..inv<p>.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_min' + str(p), minima)`` where ``minima`` is a NumPy array
    with one value per row.
    """
    first_col = inv + '1'
    last_col = inv + str(p)
    monthly = data.loc[:, first_col:last_col]
    minima = np.nanmin(monthly, axis=1)
    return '{}_min{}'.format(inv, p), minima

# Within the most recent p months: how many months ago inv was last > 0.
def Msg(inv, p):
    """Locate, per row, the first column among inv1..inv<p> that is > 0.

    Reads the module-level ``data`` frame. Column ``inv1`` counts as
    position 1, ``inv2`` as position 2, and so on. Rows with no positive
    value in the window get 0.

    Returns ``(inv + '_msg' + str(p), months)`` where ``months`` is a list
    of plain ints. The "no positive value" marker is the integer 0 rather
    than the string ``'0'``, so the resulting feature column has a single
    numeric type.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    positive = np.asarray(window > 0)
    # argmax on a boolean row yields the position of the first True; it also
    # yields 0 for an all-False row, hence the explicit any() guard.
    first_hit = positive.argmax(axis=1) + 1
    months = np.where(positive.any(axis=1), first_hit, 0)
    return inv + '_msg' + str(p), months.tolist()
 

# Within the most recent p months: how many months ago inv was last == 0.
def Msz(inv, p):
    """Locate, per row, the first column among inv1..inv<p> that equals 0.

    Reads the module-level ``data`` frame. Column ``inv1`` counts as
    position 1, ``inv2`` as position 2, and so on. Rows with no zero value
    in the window get 0.

    Returns ``(inv + '_msz' + str(p), months)`` where ``months`` is a list
    of plain ints. The "no zero value" marker is the integer 0 rather than
    the string ``'0'``, so the resulting feature column has a single
    numeric type.
    """
    window = data.loc[:, inv + '1':inv + str(p)]
    is_zero = np.asarray(window == 0)
    # argmax on a boolean row yields the position of the first True; it also
    # yields 0 for an all-False row, hence the explicit any() guard.
    first_hit = is_zero.argmax(axis=1) + 1
    months = np.where(is_zero.any(axis=1), first_hit, 0)
    return inv + '_msz' + str(p), months.tolist()
    
# Current-month inv divided by the mean of inv over the most recent p months.
def Cav(inv, p):
    """Divide column inv1 by the NaN-ignoring row mean of inv1..inv<p>.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_cav' + str(p), ratio)`` where ``ratio`` is a pandas Series
    indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_mean = np.nanmean(window, axis=1)
    ratio = window[current_col].div(window_mean)
    return '{}_cav{}'.format(inv, p), ratio

# Current-month inv divided by the minimum of inv over the most recent p months.
def Cmn(inv, p):
    """Divide column inv1 by the NaN-ignoring row minimum of inv1..inv<p>.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_cmn' + str(p), ratio)`` where ``ratio`` is a pandas Series
    indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_min = np.nanmin(window, axis=1)
    ratio = window[current_col].div(window_min)
    return '{}_cmn{}'.format(inv, p), ratio

# Largest month-over-month increase of inv within the most recent p months.
def Mai(inv, p):
    """Return, per row, the largest value of inv[k] - inv[k+1] for adjacent columns.

    Reads columns inv1..inv<p> of the module-level ``data`` frame; NaN
    differences are ignored by ``np.nanmax``. With a single-column window
    there are no differences and ``np.nanmax`` raises ``ValueError``.

    Returns ``(inv + '_mai' + str(p), values)`` where ``values`` is a list
    with one entry per row.
    """
    rows = np.array(data.loc[:, inv + '1':inv + str(p)])
    largest_increase = []
    for row in rows:
        # Each difference is "this column minus the next column".
        deltas = [newer - older for newer, older in zip(row[:-1], row[1:])]
        largest_increase.append(np.nanmax(deltas))
    return '{}_mai{}'.format(inv, p), largest_increase

# Largest month-over-month decrease of inv within the most recent p months.
def Mad(inv, p):
    """Return, per row, the largest value of inv[k+1] - inv[k] for adjacent columns.

    Reads columns inv1..inv<p> of the module-level ``data`` frame; NaN
    differences are ignored by ``np.nanmax``. With a single-column window
    there are no differences and ``np.nanmax`` raises ``ValueError``.

    Returns ``(inv + '_mad' + str(p), values)`` where ``values`` is a list
    with one entry per row.
    """
    rows = np.array(data.loc[:, inv + '1':inv + str(p)])
    # np.diff(row)[k] is row[k + 1] - row[k], i.e. "next column minus this one".
    largest_decrease = [np.nanmax(np.diff(row)) for row in rows]
    return '{}_mad{}'.format(inv, p), largest_decrease

# Standard deviation of inv over the most recent p months.
def Std(inv, p):
    """Compute the NaN-ignoring row standard deviation of inv1..inv<p>.

    Reads the module-level ``data`` frame and uses ``np.nanstd`` with its
    default settings. Returns ``(inv + '_std' + str(p), deviations)`` where
    ``deviations`` is a NumPy array with one value per row.
    """
    first_col = inv + '1'
    last_col = inv + str(p)
    monthly = data.loc[:, first_col:last_col]
    deviations = np.nanstd(monthly, axis=1)
    return '{}_std{}'.format(inv, p), deviations

    
# Coefficient of variation of inv over the most recent p months.
def Cva(inv, p):
    """Divide the row standard deviation of inv1..inv<p> by the row mean.

    Reads the module-level ``data`` frame; both statistics ignore NaN.
    Returns ``(inv + '_cva' + str(p), ratio)`` where ``ratio`` is a NumPy
    array with one value per row.
    """
    first_col = inv + '1'
    last_col = inv + str(p)
    monthly = data.loc[:, first_col:last_col]
    spread = np.nanstd(monthly, axis=1)
    centre = np.nanmean(monthly, axis=1)
    return '{}_cva{}'.format(inv, p), spread / centre



# (Current-month inv) - (mean of inv over the most recent p months).
def Cmm(inv, p):
    """Subtract the NaN-ignoring row mean of inv1..inv<p> from column inv1.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_cmm' + str(p), gap)`` where ``gap`` is a pandas Series
    indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_mean = np.nanmean(window, axis=1)
    gap = window[current_col].sub(window_mean)
    return '{}_cmm{}'.format(inv, p), gap

# (Current-month inv) - (minimum of inv over the most recent p months).
def Cnm(inv, p):
    """Subtract the NaN-ignoring row minimum of inv1..inv<p> from column inv1.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_cnm' + str(p), gap)`` where ``gap`` is a pandas Series
    indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_min = np.nanmin(window, axis=1)
    gap = window[current_col].sub(window_min)
    return '{}_cnm{}'.format(inv, p), gap


# (Current-month inv) - (maximum of inv over the most recent p months).
def Cxm(inv, p):
    """Subtract the NaN-ignoring row maximum of inv1..inv<p> from column inv1.

    Reads the module-level ``data`` frame. Returns
    ``(inv + '_cxm' + str(p), gap)`` where ``gap`` is a pandas Series
    indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_max = np.nanmax(window, axis=1)
    gap = window[current_col].sub(window_max)
    return '{}_cxm{}'.format(inv, p), gap



# ((current-month inv) - (max of inv over the most recent p months))
#   / (max of inv over the most recent p months)
def Cxp(inv, p):
    """Relative gap between column inv1 and the row maximum of inv1..inv<p>.

    Reads the module-level ``data`` frame. The reference value is the
    NaN-ignoring row *maximum*, as the feature definition states; the
    earlier implementation used the minimum by mistake, which made this
    feature a duplicate of the min-based ratio.

    Returns ``(inv + '_cxp' + str(p), ratio)`` where ``ratio`` is a pandas
    Series indexed like ``data``.
    """
    df = data.loc[:, inv + '1':inv + str(p)]
    window_max = np.nanmax(df, axis=1)
    auto_value = (df[inv + '1'] - window_max) / window_max
    return inv + '_cxp' + str(p), auto_value

# Range (max - min) of inv over the most recent p months.
def Ran(inv, p):
    """Compute the row maximum minus the row minimum of inv1..inv<p>.

    Reads the module-level ``data`` frame; both extremes ignore NaN.
    Returns ``(inv + '_ran' + str(p), spread)`` where ``spread`` is a NumPy
    array with one value per row.
    """
    first_col = inv + '1'
    last_col = inv + str(p)
    monthly = data.loc[:, first_col:last_col]
    highest = np.nanmax(monthly, axis=1)
    lowest = np.nanmin(monthly, axis=1)
    return '{}_ran{}'.format(inv, p), highest - lowest


# Number of adjacent month pairs, within the most recent p months, in which
# inv increased (the original note refers to min(Time on book, p) months).
def Nci(inv, p):
    """Count, per row, the adjacent pairs with inv[k] - inv[k+1] > 0.

    Reads columns inv1..inv<p> of the module-level ``data`` frame. The
    window is always p columns wide; no time-on-book column is consulted.
    Pairs involving NaN are not counted because the comparison is False.

    Returns ``(inv + '_nci' + str(p), counts)`` where ``counts`` is a list
    with one entry per row.
    """
    values = np.array(data.loc[:, inv + '1':inv + str(p)])
    # Column k minus column k + 1, for every adjacent pair at once.
    steps = values[:, :-1] - values[:, 1:]
    counts = (steps > 0).sum(axis=1)
    return '{}_nci{}'.format(inv, p), list(counts)
   

# Number of adjacent month pairs, within the most recent p months, in which
# inv decreased (the original note refers to min(Time on book, p) months).
def Ncd(inv, p):
    """Count, per row, the adjacent pairs with inv[k] - inv[k+1] < 0.

    Reads columns inv1..inv<p> of the module-level ``data`` frame. The
    window is always p columns wide; no time-on-book column is consulted.
    Pairs involving NaN are not counted because the comparison is False.

    Returns ``(inv + '_ncd' + str(p), counts)`` where ``counts`` is a list
    with one entry per row.
    """
    values = np.array(data.loc[:, inv + '1':inv + str(p)])
    # Column k minus column k + 1, for every adjacent pair at once.
    steps = values[:, :-1] - values[:, 1:]
    counts = (steps < 0).sum(axis=1)
    return '{}_ncd{}'.format(inv, p), list(counts)
           

# Number of adjacent month pairs, within the most recent p months, in which
# inv stayed equal (the original note refers to min(Time on book, p) months).
def Ncn(inv, p):
    """Count, per row, the adjacent pairs with inv[k] - inv[k+1] == 0.

    Reads columns inv1..inv<p> of the module-level ``data`` frame. The
    window is always p columns wide; no time-on-book column is consulted.
    Pairs involving NaN are not counted because the comparison is False.

    Returns ``(inv + '_ncn' + str(p), counts)`` where ``counts`` is a list
    with one entry per row.
    """
    values = np.array(data.loc[:, inv + '1':inv + str(p)])
    # Column k minus column k + 1, for every adjacent pair at once.
    steps = values[:, :-1] - values[:, 1:]
    counts = (steps == 0).sum(axis=1)
    return '{}_ncn{}'.format(inv, p), list(counts)
 
# Flag = 1 if, within the most recent p months, inv[i] > inv[i+1] holds for
# every adjacent pair of months (strictly increasing); otherwise flag = 0.
def Bup(inv, p):
    """Flag rows where every adjacent pair satisfies inv[k] > inv[k+1].

    Reads columns inv1..inv<p> of the module-level ``data`` frame. The
    earlier implementation always produced 0: ``index =+ 1`` assigned +1
    instead of incrementing, and the final ``index == p`` test could not
    succeed after only p - 1 comparisons. Its break condition was also the
    opposite of the inequality stated in the feature definition. This
    version evaluates the stated inequality directly.

    Notes:
        * A window with fewer than two columns has no pair to compare and
          yields 0 for every row.
        * Any pair involving NaN compares False, so such rows get 0.
        * The original note also mentions ``inv > 0`` and
          ``min(Time on book, p)``; neither was ever checked by the code
          and neither is enforced here. NOTE(review): confirm whether they
          should be.

    Returns ``(inv + '_bup' + str(p), flags)`` where ``flags`` is a list of
    plain ints (0 or 1), one per row.
    """
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    if arr.shape[1] < 2:
        return inv + '_bup' + str(p), [0] * len(arr)
    every_pair_holds = (arr[:, :-1] > arr[:, 1:]).all(axis=1)
    return inv + '_bup' + str(p), every_pair_holds.astype(int).tolist()

# Flag = 1 if, within the most recent p months, inv[i] < inv[i+1] holds for
# every adjacent pair of months (strictly decreasing); otherwise flag = 0.
def Pdn(inv, p):
    """Flag rows where every adjacent pair satisfies inv[k] < inv[k+1].

    Reads columns inv1..inv<p> of the module-level ``data`` frame. The
    earlier implementation always produced 0: ``index =+ 1`` assigned +1
    instead of incrementing, and the final ``index == p`` test could not
    succeed after only p - 1 comparisons. Its break condition was also the
    opposite of the inequality stated in the feature definition. This
    version evaluates the stated inequality directly.

    Notes:
        * A window with fewer than two columns has no pair to compare and
          yields 0 for every row.
        * Any pair involving NaN compares False, so such rows get 0.
        * The original note also mentions ``inv > 0`` and
          ``min(Time on book, p)``; neither was ever checked by the code
          and neither is enforced here. NOTE(review): confirm whether they
          should be.

    Returns ``(inv + '_pdn' + str(p), flags)`` where ``flags`` is a list of
    plain ints (0 or 1), one per row.
    """
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    if arr.shape[1] < 2:
        return inv + '_pdn' + str(p), [0] * len(arr)
    every_pair_holds = (arr[:, :-1] < arr[:, 1:]).all(axis=1)
    return inv + '_pdn' + str(p), every_pair_holds.astype(int).tolist()

# Trimmed mean of inv over the most recent p months (the original note refers
# to min(Time on book, p) months).
def Trm(inv, p):
    """Row mean of inv1..inv<p> after dropping one maximum and one minimum.

    Reads the module-level ``data`` frame. Rows are visited by position, so
    the result does not depend on ``data`` having a default 0..n-1 index
    (the earlier ``df.loc[i, :]`` lookup raised ``KeyError`` or picked the
    wrong row for any other index).

    For each row, one occurrence of the NaN-ignoring maximum is removed,
    then one occurrence of the NaN-ignoring minimum of what is left, and
    the NaN-ignoring mean of the remainder is taken. As before, windows too
    narrow to trim make NumPy raise ``ValueError``, which callers are
    expected to handle.

    Returns ``(inv + '_trm' + str(p), values)`` where ``values`` is a list
    with one entry per row.
    """
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = []
    for row in np.array(df):
        remaining = row.tolist()
        remaining.remove(np.nanmax(remaining))
        remaining.remove(np.nanmin(remaining))
        auto_value.append(np.nanmean(remaining))
    return inv + '_trm' + str(p), auto_value

# (current-month inv - max of inv over the most recent p months)
#   / max of inv over the most recent p months
def Cmx(inv, p):
    """Relative gap between column inv1 and the row maximum of inv1..inv<p>.

    Reads the module-level ``data`` frame; the maximum ignores NaN.
    Returns ``(inv + '_cmx' + str(p), ratio)`` where ``ratio`` is a pandas
    Series indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_max = np.nanmax(window, axis=1)
    ratio = window[current_col].sub(window_max).div(window_max)
    return '{}_cmx{}'.format(inv, p), ratio

# (current-month inv - mean of inv over the most recent p months) / that mean
def Cmp(inv, p):
    """Relative gap between column inv1 and the row mean of inv1..inv<p>.

    Reads the module-level ``data`` frame; the mean ignores NaN.
    Returns ``(inv + '_cmp' + str(p), ratio)`` where ``ratio`` is a pandas
    Series indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_mean = np.nanmean(window, axis=1)
    ratio = window[current_col].sub(window_mean).div(window_mean)
    return '{}_cmp{}'.format(inv, p), ratio


# (current-month inv - min of inv over the most recent p months) / that min
def Cnp(inv, p):
    """Relative gap between column inv1 and the row minimum of inv1..inv<p>.

    Reads the module-level ``data`` frame; the minimum ignores NaN.
    Returns ``(inv + '_cnp' + str(p), ratio)`` where ``ratio`` is a pandas
    Series indexed like ``data``.
    """
    current_col = inv + '1'
    window = data.loc[:, current_col:inv + str(p)]
    window_min = np.nanmin(window, axis=1)
    ratio = window[current_col].sub(window_min).div(window_min)
    return '{}_cnp{}'.format(inv, p), ratio


# Within the most recent p months: how many months ago inv reached its
# maximum (the original note refers to min(Time on book, p) months).
def Msx(inv, p):
    """Locate, per row, the first column among inv1..inv<p> equal to the row maximum.

    Reads the module-level ``data`` frame without modifying it. The earlier
    implementation added a helper column to, and overwrote columns of, a
    slice of ``data``, which is the pattern pandas warns about with
    SettingWithCopyWarning. Here the window is handled as a plain array.

    Column ``inv1`` counts as position 1. When no column equals the
    NaN-ignoring maximum (a row that is entirely NaN), the result is one
    more than the number of columns in the window, as before.

    Returns ``(inv + '_msx' + str(p), positions)`` where ``positions`` is a
    list of plain ints, one per row.
    """
    values = np.array(data.loc[:, inv + '1':inv + str(p)])
    row_max = np.nanmax(values, axis=1)
    is_max = values == row_max[:, None]
    # argmax gives the first True per row; rows with no True fall back to
    # "window width + 1", the value the original scan ended on.
    positions = np.where(
        is_max.any(axis=1), is_max.argmax(axis=1) + 1, values.shape[1] + 1
    )
    return inv + '_msx' + str(p), positions.tolist()


# Mean of inv over the most recent p months divided by the mean of inv over
# months p through 2p.
def Rpp(inv, p):
    """Ratio of the row mean of inv1..inv<p> to the row mean of inv<p>..inv<2p>.

    Reads the module-level ``data`` frame; both means ignore NaN. Both
    windows include month p, so the second window is p + 1 columns wide
    and overlaps the first by one month.

    Returns ``(inv + '_rpp' + str(p), ratio)`` where ``ratio`` is a NumPy
    array with one value per row.
    """
    pivot_col = inv + str(p)
    recent_mean = np.nanmean(data.loc[:, inv + '1':pivot_col], axis=1)
    earlier_mean = np.nanmean(data.loc[:, pivot_col:inv + str(2 * p)], axis=1)
    return '{}_rpp{}'.format(inv, p), recent_mean / earlier_mean


# Mean of inv over the most recent p months minus the mean of inv over
# months p through 2p.
def Dpp(inv, p):
    """Difference of the row mean of inv1..inv<p> and the row mean of inv<p>..inv<2p>.

    Reads the module-level ``data`` frame; both means ignore NaN. Both
    windows include month p, so the second window is p + 1 columns wide
    and overlaps the first by one month.

    Returns ``(inv + '_dpp' + str(p), gap)`` where ``gap`` is a NumPy array
    with one value per row.
    """
    pivot_col = inv + str(p)
    recent_mean = np.nanmean(data.loc[:, inv + '1':pivot_col], axis=1)
    earlier_mean = np.nanmean(data.loc[:, pivot_col:inv + str(2 * p)], axis=1)
    return '{}_dpp{}'.format(inv, p), recent_mean - earlier_mean

# (Max of inv over the most recent p months) / (max of inv over months p
# through 2p).
def Mpp(inv, p):
    """Ratio of the row maximum of inv1..inv<p> to the row maximum of inv<p>..inv<2p>.

    Reads the module-level ``data`` frame; both maxima ignore NaN. Both
    windows include month p, so the second window is p + 1 columns wide
    and overlaps the first by one month.

    Returns ``(inv + '_mpp' + str(p), ratio)`` where ``ratio`` is a NumPy
    array with one value per row.
    """
    pivot_col = inv + str(p)
    recent_max = np.nanmax(data.loc[:, inv + '1':pivot_col], axis=1)
    earlier_max = np.nanmax(data.loc[:, pivot_col:inv + str(2 * p)], axis=1)
    return '{}_mpp{}'.format(inv, p), recent_max / earlier_max
  
# (Min of inv over the most recent p months) / (min of inv over months p
# through 2p).
def Npp(inv, p):
    """Ratio of the row minimum of inv1..inv<p> to the row minimum of inv<p>..inv<2p>.

    Reads the module-level ``data`` frame; both minima ignore NaN. Both
    windows include month p, so the second window is p + 1 columns wide
    and overlaps the first by one month.

    Returns ``(inv + '_npp' + str(p), ratio)`` where ``ratio`` is a NumPy
    array with one value per row.
    """
    pivot_col = inv + str(p)
    recent_min = np.nanmin(data.loc[:, inv + '1':pivot_col], axis=1)
    earlier_min = np.nanmin(data.loc[:, pivot_col:inv + str(2 * p)], axis=1)
    return '{}_npp{}'.format(inv, p), recent_min / earlier_min

下面定义一个函数,直接封装上面的全部函数

# Run all of the feature function definitions above first.
       
# Batch driver: call every two-parameter feature function for one (inv, p).
def auto_var2(inv, p):
    """Apply all feature functions to ``(inv, p)`` and store the results.

    Each feature function returns ``(column_name, values)``; the values are
    written into the module-level ``data_new`` frame under that column
    name. A function that raises is skipped and a line of the form
    ``"<Name> PARSE ERROR <inv> <p>"`` is printed, so one unavailable
    feature (for example a window that needs columns the data does not
    have) does not stop the rest.

    Only ``Exception`` subclasses are treated as feature failures;
    ``KeyboardInterrupt`` and ``SystemExit`` propagate, which the earlier
    bare ``except:`` clauses prevented.

    Args:
        inv: Column-name prefix of the monthly variable, e.g. ``'ft'``.
        p: Window length in months.

    Returns:
        The number of columns in ``data_new`` after the call.
    """
    #global data_new
    feature_functions = (
        Num, Nmz, Evr, Avg, Tot, Tot2T, Max, Min, Msg, Msz,
        Cav, Cmn, Mai, Mad, Std, Cva, Cmm, Cnm, Cxm, Cxp,
        Ran, Nci, Ncd, Ncn, Bup, Pdn, Trm, Cmx, Cmp, Cnp,
        Msx, Rpp, Dpp, Mpp, Npp,
    )
    for feature_function in feature_functions:
        try:
            columns_name, values = feature_function(inv, p)
            data_new[columns_name] = values
        except Exception:
            # Same message text as before: "<Name> PARSE ERROR <inv> <p>".
            print(feature_function.__name__ + " PARSE ERROR", inv, p)
    return data_new.columns.size

然后我们用一个小demo来实验一下

import pandas as pd
# `data` is the raw data set with features and labels already prepared.
# It is read from a hard-coded local Excel path; adjust the path to run this.
data = pd.read_excel('/Users/zhucan/Desktop/金融风控实战/第三课资料/textdata.xlsx')
# Bare expression: displays the frame when run in a notebook, no effect in a script.
data
# The four bare string literals below are the author's notes and have no effect
# at runtime. In English:
#   - ft and gt are two variable names; the suffix 1-12 denotes the value for
#     each of the 12 months.
#   - ft1 is the number of refuelling events computed from data within one
#     month of the application date.
#   - ft2 is ft computed from data 30 to 60 days before the application date.
#   - gt1 is the refuelling amount computed from data within one month of the
#     application date.
""" ft 和 gt 表示两个变量名 1-12 表示对应12个月中每个月的相应数值  2019.4.18-2019.5.18"""
'''ft1 指的是 离申请当天一个月内的数据计算出来的加油次数  2019.3.18-2019.4.18'''
'''ft2 指的是 离申请当天30天到60天内的数据计算出的ft'''
'''gt1 指的是 离申请当天一个月内的数据计算出来的加油金额'''

# Bare expression: shows the column labels (recorded output follows as comments).
data.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
#       'ft9', 'ft10', 'ft11', 'ft12', 'TOB', 'gt1', 'gt2', 'gt3', 'gt4', 'gt5',
#       'gt6', 'gt7', 'gt8', 'gt9', 'gt10', 'gt11', 'gt12'],
#      dtype='object')
# Work on a copy so the derived feature columns are added to `data_new`
# while the feature functions keep reading the untouched `data`.
data_new = data.copy()
# The next two assignments are overwritten by the loop variables below.
p = 4
inv = 'ft'
# `auto_data` is created here but not used anywhere in the visible code.
auto_data = pd.DataFrame()
# Derive every feature for window lengths 1..12 and for both variables.
# The commented lines after the loop record the messages it printed.
for p in range(1,13):
    for inv in ['ft','gt']:
        auto_var2(inv,p)
# Mai PARSE ERROR ft 1
# Mad PARSE ERROR ft 1
# Trm PARSE ERROR ft 1
# Mai PARSE ERROR gt 1
# Mad PARSE ERROR gt 1
# Trm PARSE ERROR gt 1
# Trm PARSE ERROR ft 2
# Trm PARSE ERROR gt 2
# Rpp PARSE ERROR ft 7
# Dpp PARSE ERROR ft 7
# Mpp PARSE ERROR ft 7
# Npp PARSE ERROR ft 7
# Rpp PARSE ERROR gt 7
# Dpp PARSE ERROR gt 7
# Mpp PARSE ERROR gt 7
# Npp PARSE ERROR gt 7
# Rpp PARSE ERROR ft 8
# Dpp PARSE ERROR ft 8
# Mpp PARSE ERROR ft 8
# Npp PARSE ERROR ft 8
# Rpp PARSE ERROR gt 8
# Dpp PARSE ERROR gt 8
# Mpp PARSE ERROR gt 8
# Npp PARSE ERROR gt 8
# Rpp PARSE ERROR ft 9
# Dpp PARSE ERROR ft 9
# Mpp PARSE ERROR ft 9
# Npp PARSE ERROR ft 9
# Rpp PARSE ERROR gt 9
# Dpp PARSE ERROR gt 9
# Mpp PARSE ERROR gt 9
# Npp PARSE ERROR gt 9
# Rpp PARSE ERROR ft 10
# Dpp PARSE ERROR ft 10
# Mpp PARSE ERROR ft 10
# Npp PARSE ERROR ft 10
# Rpp PARSE ERROR gt 10
# Dpp PARSE ERROR gt 10
# Mpp PARSE ERROR gt 10
# Npp PARSE ERROR gt 10
# Rpp PARSE ERROR ft 11
# Dpp PARSE ERROR ft 11
# Mpp PARSE ERROR ft 11
# Npp PARSE ERROR ft 11
# Rpp PARSE ERROR gt 11
# Dpp PARSE ERROR gt 11
# Mpp PARSE ERROR gt 11
# Npp PARSE ERROR gt 11
# Tot2T PARSE ERROR ft 12
# Rpp PARSE ERROR ft 12
# Dpp PARSE ERROR ft 12
# Mpp PARSE ERROR ft 12
# Npp PARSE ERROR ft 12
# Tot2T PARSE ERROR gt 12
# Rpp PARSE ERROR gt 12
# Dpp PARSE ERROR gt 12
# Mpp PARSE ERROR gt 12
# Npp PARSE ERROR gt 12
# Bare expression: shows (rows, columns) of the enriched frame in a notebook.
# The comment below is the output recorded by the author.
data_new.shape
#(5, 808)
# Bare expression: shows the column labels of the enriched frame.
# The comments below are the (abbreviated) output recorded by the author.
data_new.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
#       'ft9',
#       ...
#       'gt_nci12', 'gt_ncd12', 'gt_ncn12', 'gt_bup12', 'gt_pdn12', 'gt_trm12',
#       'gt_cmx12', 'gt_cmp12', 'gt_cnp12', 'gt_msx12'],
#      dtype='object', length=808)
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值