申请评分卡——特征工程

最新推荐文章于 2021-03-16 06:00:52 发布

pandacode

最新推荐文章于 2021-03-16 06:00:52 发布

阅读量1.3k

点赞数

本文链接：https://blog.csdn.net/pandacode/article/details/82118977

版权

点击下载要使用的数据

读取数据并导入相关包

import pandas as pd
import datetime
import collections
import numpy as np
import random
#from sklearn.preprocessing import MDLP
from numpy import nan as NA
from pandas import Series
import os 
from pandas import DataFrame
from numpy import nan as NA

# Load the three raw training tables from the current working directory
# (call os.chdir(...) first if the CSV files live somewhere else).
# NOTE(review): the master file name contains "GBK"; if the default decoder
# fails on it, pass encoding='gbk' to read_csv -- confirm against the data.
data1, data2, data3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'PPD_LogInfo_3_1_Training_Set.csv',
        'PPD_Training_Master_GBK_3_1_Training_Set.csv',
        'PPD_Userupdate_Info_3_1_Training_Set.csv',
    )
)

使用时间切片衍生特征

1、将成功借款前一段时间内，每种操作代码（LogInfo1）和每种操作类别（LogInfo2）各自出现的次数作为衍生特征。

2、将成功借款前的一段时间内总的操作次数作为衍生特征。

3、将成功借款前的一段时间内是否修改过特定信息作为衍生特征。

# Convert the string dates into real timestamps, then compute how many days
# lie between each logged operation and the listing date.
data1['Loginfo3_t'] = pd.to_datetime(data1['LogInfo3'], format='%Y-%m-%d')
data1['listinginfo_t'] = pd.to_datetime(data1['Listinginfo1'], format='%Y-%m-%d')
# Column-wise subtraction replaces the former row-wise apply(), which looked
# up each row's values by position (x[1] - x[0]); positional lookup on a
# label-indexed row is deprecated in recent pandas and the per-row call is slow.
data1['ListingGap'] = (data1['listinginfo_t'] - data1['Loginfo3_t']).dt.days
print('最大时间间隔：', data1['ListingGap'].max())
# Evaluate candidate window lengths of 30, 60, ..., 690 days.
# TimeWindowSelection is defined outside this excerpt; presumably it reports
# the share of samples covered by each window -- confirm.
freq = TimeWindowSelection(data1, 'ListingGap', range(30, 720, 30))
print(freq)
# Author's observation: a 180-day window already covers more than 95% of the
# samples, so the features below are sliced on a 180-day window.
# Every user id (Idx) that occurs in any of the three tables.  Sorting makes
# the row order of the feature tables (and of the CSVs written from them)
# reproducible; the previous list(set(...)) order depended on set iteration.
idx = sorted(set(data1['Idx']) | set(data2['Idx']) | set(data3['Idx']))
# Column labels for the two count tables.  GetColumns is defined outside this
# excerpt; the fill loop further down writes to labels of the form
# 'code_<value>', so it presumably returns those -- confirm.
LogInfo1_columns = GetColumns(data1, 'LogInfo1')
LogInfo2_columns = GetColumns(data1, 'LogInfo2')
# Zero-initialised tables: one row per user, one column per code label.
data1_LogInfo1 = DataFrame(
    np.zeros((len(idx), len(LogInfo1_columns))),
    index=idx,
    columns=LogInfo1_columns,
)
data1_LogInfo2 = DataFrame(
    np.zeros((len(idx), len(LogInfo2_columns))),
    index=idx,
    columns=LogInfo2_columns,
)
def _fill_code_counts(log, table, code_col):
    """Store per-user occurrence counts of each ``code_col`` value in ``table``.

    For every (Idx, code) pair present in ``log`` the number of matching rows
    is written to ``table.loc[Idx, 'code_<code>']``.  ``table`` is modified in
    place.  After each user a progress fraction is printed, as the original
    hand-written loops did.

    Uses groupby/.loc instead of the ``.ix`` indexer, which was removed in
    pandas 1.0.
    """
    pair_counts = log.groupby(['Idx', code_col]).size()
    n_users = log['Idx'].nunique()
    per_user = pair_counts.groupby(level='Idx')
    for done, (user, user_counts) in enumerate(per_user, start=1):
        for code, n_rows in user_counts.droplevel('Idx').items():
            table.loc[user, 'code_' + str(code)] = n_rows
        print(done / n_users)


# Keep only the operations whose gap to the listing date is at most 180 days.
data1['min_180'] = (data1['ListingGap'] <= 180).astype(int)
data1 = data1.loc[data1['min_180'] == 1]
# These indexed views are no longer needed for the counting below; they are
# retained only because code outside this excerpt may still reference them.
# TODO: confirm and drop.
data1_set1 = data1.set_index(['Idx', 'LogInfo1'], drop=False)
data1_set2 = data1.set_index(['Idx', 'LogInfo2'], drop=False)

# Per-user counts of each LogInfo1 value.
_fill_code_counts(data1, data1_LogInfo1, 'LogInfo1')
data1_LogInfo1.to_csv('data1_LogInfo1.csv')

# Per-user counts of each LogInfo2 value; the '_classes' suffix keeps these
# columns distinguishable from the LogInfo1 columns.
_fill_code_counts(data1, data1_LogInfo2, 'LogInfo2')
change_columns = [col + '_classes' for col in data1_LogInfo2.columns]
data1_LogInfo2.columns = change_columns
data1_LogInfo2.to_csv('data1_LogInfo2.csv')
# Convert the string dates into real timestamps, then compute how many days
# lie between each profile update and the listing date.
data3['UserupdateInfo2_t'] = pd.to_datetime(data3['UserupdateInfo2'], format='%Y/%m/%d')
data3['listinginfo_t'] = pd.to_datetime(data3['ListingInfo1'], format='%Y/%m/%d')
# Column-wise subtraction replaces the former row-wise apply(), which looked
# up each row's values by position (x[1] - x[0]); positional lookup on a
# label-indexed row is deprecated in recent pandas and the per-row call is slow.
data3['ListingGap'] = (data3['listinginfo_t'] - data3['UserupdateInfo2_t']).dt.days
print('最大时间间隔：', data3['ListingGap'].max())
# Evaluate candidate window lengths of 30, 60, ..., 690 days.
# TimeWindowSelection is defined outside this excerpt; presumably it reports
# the share of samples covered by each window -- confirm.
freq = TimeWindowSelection(data3, 'ListingGap', range(30, 720, 30))
print(freq)
# Author's observation: 150 days already cover 95% of the samples.  The
# filter below nevertheless uses a 180-day window.
data3['min_180'] = (data3['ListingGap'] <= 180).astype(int)
# .copy() detaches the filtered rows from the original frame so the column
# added next does not raise SettingWithCopyWarning.
data3 = data3.loc[data3['min_180'] == 1].copy()
# Normalise the updated-field names to lower case; .str.lower() passes
# missing values through instead of failing on them.
data3['UserupdateInfo1_l'] = data3['UserupdateInfo1'].str.lower()
# Distinct fields that users modified, as candidates for derived features.
data3_UserupdateInfo1_l_columns = list(data3['UserupdateInfo1_l'].unique())
#较为应该注意的特征
notice

最低0.47元/天解锁文章

pandacode

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
4
评论
申请评分卡——特征工程

点击下载要使用的数据。读取数据并导入相关包：import pandas as pd; import datetime; import collections; import numpy as np; import random; #from sklearn.preprocessing import MDLP; from numpy import nan as NA; from pandas import ...
复制链接

扫一扫