特征处理+frequency

大佬喝可乐

已于 2022-01-19 00:09:06 修改

阅读量205

点赞数

分类专栏：数据分析机器学习文章标签： python

于 2022-01-18 15:19:53 首次发布

本文链接：https://blog.csdn.net/weixin_44322171/article/details/122560801

版权

数据分析同时被 2 个专栏收录

23 篇文章 1 订阅

订阅专栏

机器学习

14 篇文章 0 订阅

订阅专栏

产品+产品流水

产品

#################################

dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'], 
          '2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}

data = p_record.copy()
dfs = []

freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
            
#前：一个月、两个月；当月前、一个月前、两个月前,reduce
# redu_b  = (data['date'] <= dict_[month])
# redu_b1 = (data['date'] < dict_[month])
# redu_b2 = (data['date'] < dict_1[month][0])
# redu_1  = (data['date'] == dict_[month])
# redu_2  = (data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])

split_month = ['redu_1', 'redu_2', 'redu_b', 'redu_b1', 'redu_b2']

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    #5种不同月份拆分
    #111111####################
    stat                             = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat.columns                     = ['core_cust_id', 'uid_count_1']
    stat['pid_nunique_1']     = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat['pid_mean_count_1']  = stat[f'uid_count_1'] / stat[f'pid_nunique_1']
    stat['apply_mean_1']      = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat['apply_max_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat['apply_min_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat['apply_std_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat['apply_median_1']    = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat['apply_sum_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat['apply_max_min_1']    = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat['uflow_1']           = data[data['date'] == dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
    #stat[f'pflow_{mont}']           = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat[f'flow_{mont}']        = data[data['date'] == dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values

    #2222222###########
    stat1                             = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat1.columns                     = ['core_cust_id', f'uid_count_2']
    stat1['pid_nunique_2']     = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat1['pid_mean_count_2']  = stat1['uid_count_2'] / stat1['pid_nunique_2']
    stat1['apply_mean_2']      = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat1['apply_max_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat1['apply_min_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat1['apply_std_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat1['apply_median_2']    = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat1['apply_sum_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat1['apply_max_min_2']    = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat1['uflow_{mont}']           = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['流水号'].agg('count').values
#         stat1[f'pflow_{mont}']           = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat1[f'flow_{mont}']        = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #333333333##################################
    stat2                             = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat2.columns                     = ['core_cust_id', f'uid_count_b']
    stat2['pid_nunique_b']     = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat2['pid_mean_count_b']  = stat2['uid_count_b'] / stat2['pid_nunique_b']
    stat2['apply_mean_b']      = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat2['apply_max_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat2['apply_min_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat2['apply_std_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat2['apply_median_b']    = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat2['apply_sum_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat2['apply_max_min_b']    = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat2['uflow_b']           = data[data['date'] <= dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat2[f'pflow_{mont}']           = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat2[f'flow_{mont}']        = data[data['date'] <= dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #4444444444#######################
    stat3                             = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat3.columns                     = ['core_cust_id', 'uid_count_b1']
    stat3['pid_nunique_b1']     = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat3['pid_mean_count_{mont}']  = stat3['uid_count_b1'] / stat3['pid_nunique_b1']
    stat3['apply_mean_b1']      = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat3['apply_max_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat3['apply_min_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat3['apply_std_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat3['apply_median_b1']    = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat3['apply_sum_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat3['apply_max_min_b1']    = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat3['uflow_b1']           = data[data['date'] < dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat3[f'pflow_{mont}']           = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat3[f'flow_{mont}']        = data[data['date'] < dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #5555555555555#############################
    stat4                             = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat4.columns                     = ['core_cust_id', f'uid_count_b2']
    stat4['pid_nunique_b2']     = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat4['pid_mean_count_b2']  = stat4['uid_count_b2'] / stat4['pid_nunique_b2']
    stat4['apply_mean_b2']      = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat4['apply_max_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat4['apply_min_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat4['apply_std_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat4['apply_median_b2']    = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat4['apply_sum_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat4['apply_max_min_b2']    = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat4['uflow_b2']           = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat4[f'pflow_{mont}']           = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat4[f'flow_{mont}']        = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', freq])['流水号'].agg('count').values

    #########################
    stat_5 = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_5.columns = ['core_cust_id', 'prod_code', 'flow_redu_b1']  
    stat_6 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_6.columns = ['core_cust_id', 'prod_code', 'flow_redu_b']
    stat_7 = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_7.columns = ['core_cust_id', 'prod_code', 'flow_redu_b2']
    stat_8 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_8.columns = ['core_cust_id', 'prod_code', 'flow_redu_1']
    stat_9 = data[(data['date'] == dict_[month][0]) & (data['date'] == dict_[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_9.columns = ['core_cust_id', 'prod_code', 'flow_redu_2']
    
    
            
    tmp_df = tmp_df.merge(stat, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat1, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat2, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat3, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat4, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat5, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat6, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat7, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat8, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat9, on= ['core_cust_id', 'prod_code'], how='left')
    
    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

加（#############）是更改过的

product = pd.read_pickle('product.pkl')

#产品的再次分类
product['prod_4'] = product['prod_code'].apply(lambda x:x[:4])
product['prod_3'] = product['prod_code'].apply(lambda x:x[:3])
product['prod_1'] = product['prod_code'].apply(lambda x:x[:1])

#收益风险比
product['收益风险比'] = product['预期收益率'] / product['风险等级']

#持有天数和收益率关系
product['天数收益比'] = product['预期收益率'] / product['持有天数']

###############################################
product.rename(columns={'prod_class':'a2'}, inplace=True)
#加category特征频度统计,
#product_counts = product.groupby(['a2'])['计价类型'].value_counts().reset_index()
count_lis_p = ['计价类型', '周期类型', '模式', '风险等级', '是否允许变更分红方式', '管理方式', '业务模式', '收益特点', '期限', '投资模式', '展示等级']
for fea in count_lis_p:
    product[f'{fea}_count'] = product.groupby(['a2', fea])['prod_code'].transform(lambda x:x.count()).values.tolist()
#product['计价类型_count'] = product.groupby(['a2', '计价类型'])['prod_code'].transform(lambda x:x.count()).values.tolist()
###############################################

com_pid = set(product['prod_code'].unique()).intersection(set(df['prod_code'].unique()))  # 求交集
tmp = product[product['prod_code'].isin(com_pid)].describe().T
useful_cols = list(tmp[(tmp['std'] != 0) & (tmp['std'] != np.nan)].index) # 判断标准差
useful_cols = [c for c in useful_cols if c not in ['数据日期']] 
print(useful_cols)

if len(useful_cols) > 0:
    #########################################################
    df = df.merge(product[['prod_code']+useful_cols], on=['prod_code', 'a2'], how='left')

产品流水

p_record = pd.read_pickle('prod_record.pkl')
#日期的年月
p_record['date'] = p_record['trade_date'].apply(lambda x: str(x)[:6])
#申请金额apply_amont
p_record.rename(columns={'申请金额': 'apply_amt'}, inplace=True)
#交易状态 + 资金状态 + 业务代码 + 渠道标识 排序组合
p_record['deal_bus'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str')
p_record['deal_bus_channel'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['渠道标识'].astype('str')
p_record['deal_bus_fund'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['资金状态'].astype('str')
p_record['deal_bus_c_f'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['渠道标识'].astype('str') + p_record['资金状态'].astype('str')
#p_record['deal_bus'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str')
#整个月的平均值与最大值或最小值相比，如果等，那就是不买，如果不等，那就购买

p_record.rename(columns={'trade_date':'a3'},inplace = True)
p_record.rename(columns={'prod_class':'a2'},inplace = True)

df = df.merge(p_record, on= ['core_cust_id','prod_code','a3','a2'], how='left')

def diff_max_min(x):
    return x.max() - x.min()

dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'], 
          '2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}

data = p_record.copy()
dfs = []

freq_lis = [业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
            
#前：一个月、两个月；当月前、一个月前、两个月前,reduce
redu_b  = (data['date'] <= dict_[month])
redu_b1 = (data['date'] < dict_[month])
redu_b2 = (data['date'] < dict_1[month][0])
redu_1  = (data['date'] == dict_[month])
redu_2  = (data['date'] == dict_1[month])

split_month = [reduce_1, reduce_2, reduce_b, reduce_b1, reduce_b2]

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    #5种不同月份拆分
    for i, month in enumerate(split_month):
        con = split_month[i]
        stat                             = data[con].groupby('core_cust_id')['prod_code'].count().reset_index()
        stat.columns                     = ['core_cust_id', f'uid_count_{month}']
        stat[f'pid_nunique_{month}']     = data[con].groupby('core_cust_id')['prod_code'].agg('nunique').values
        stat[f'pid_mean_count_{month}']  = stat_1[f'uid_count{month}'] / stat_1[f'pid_nunique{month}']
        stat[f'apply_mean_{month}']      = data[con].groupby('core_cust_id')['apply_amt'].agg('mean').values
        stat[f'apply_max_{month}']       = data[con].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
        stat[f'apply_min_{month}']       = data[con].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
        stat[f'apply_std_{month}']       = data[con].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
        stat[f'apply_median_{month}']    = data[con].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
        stat[f'apply_sum_{month}']       = data[con].groupby('core_cust_id')['apply_amt'].agg('sum').values
        stat[f'apply_max_min{month}']    = data[con].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
        #频度统计
        stat[f'uflow_{month}']           = data[con].groupby('core_cust_id')['流水号'].agg('count').values
        stat[f'pflow_{month}']           = data[con].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
        for freq in freq_lis:
            stat[f'flow_{month}']        = data[con].groupby(['core_cust_id', freq])['流水号'].agg('count').values
            
    tmp_df = tmp_df.merge(stat, on= ['core_cust_id','prod_code'], how='left')
    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

R点击表

r = pd.read_csv(data_path_2 + 'r.csv')
r['date'] = r['r5'].apply(lambda x: x[:7])

dict_  = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dict_1 = {'2021-07-01':['2021-05', '2021-06'], '2021-08-01':['2021-06', '2021-07'], 
          '2021-09-01':['2021-07', '2021-08'], '2021-10-01':['2021-08', '2021-09']}

dfs = []

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]

    stat_1 = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat_1.columns = ['core_cust_id','uid_click_prod_count']
    stat_1['pid_click_nunique_all'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat_1['uid_click_all']         = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['r1'].agg('count').values
    ###################
    groupby(id), 频度统计、统计量
    groupby(id, prod), 频度统计、统计量
    ###################
    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')

    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

大佬喝可乐

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
特征处理+frequency

产品+产品流水产品产品流水R点击表产品加（#############）是更改过的product = pd.read_pickle('product.pkl')#产品的再次分类product['prod_4'] = product['prod_code'].apply(lambda x:x[:4])product['prod_3'] = product['prod_code'].apply(lambda x:x[:3])product['prod_1'] = product['prod_cod
复制链接

扫一扫

专栏目录