R+产品流水

大佬喝可乐

已于 2022-01-19 21:38:22 修改

阅读量223

点赞数

分类专栏：数据分析机器学习文章标签： r语言开发语言 python

于 2022-01-19 17:20:32 首次发布

本文链接：https://blog.csdn.net/weixin_44322171/article/details/122585753

版权

数据分析同时被 2 个专栏收录

23 篇文章 1 订阅

订阅专栏

机器学习

14 篇文章 0 订阅

订阅专栏

流水+R的特征

流水

更新流水

dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'], 
          '2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}

data = p_record.copy()
dfs = []

#freq_lis = [['业务代码'], ['渠道标识'], ['资金状态'], ['交易状态'], ['deal_bus'], ['deal_bus_channel'],['deal_bus_fund'], ['deal_bus_c_f']]
            
#前：一个月、两个月；当月前、一个月前、两个月前,reduce
# redu_b  = (data['date'] <= dict_[month])
# redu_b1 = (data['date'] < dict_[month])
# redu_b2 = (data['date'] < dict_1[month][0])
# redu_1  = (data['date'] == dict_[month])
# redu_2  = (data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])

split_month = ['redu_1', 'redu_2', 'redu_b', 'redu_b1', 'redu_b2']

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    #5种不同月份拆分
    #111111####################
    stat                             = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat.columns                     = ['core_cust_id', 'uid_count_1']
    stat['pid_nunique_1']     = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat['pid_mean_count_1']  = stat[f'uid_count_1'] / stat[f'pid_nunique_1']
    stat['apply_mean_1']      = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat['apply_max_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat['apply_min_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat['apply_std_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat['apply_median_1']    = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat['apply_sum_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat['apply_max_min_1']    = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat['uflow_1']           = data[data['date'] == dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
    #stat[f'pflow_{mont}']           = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat[f'flow_{mont}']        = data[data['date'] == dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values

    #2222222###########
    stat1                             = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat1.columns                     = ['core_cust_id', f'uid_count_2']
    stat1['pid_nunique_2']     = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat1['pid_mean_count_2']  = stat1['uid_count_2'] / stat1['pid_nunique_2']
    stat1['apply_mean_2']      = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat1['apply_max_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat1['apply_min_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat1['apply_std_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat1['apply_median_2']    = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat1['apply_sum_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat1['apply_max_min_2']    = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat1['uflow_2']           = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['流水号'].agg('count').values
#         stat1[f'pflow_{mont}']           = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat1[f'flow_{mont}']        = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #333333333##################################
    stat2                             = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat2.columns                     = ['core_cust_id', f'uid_count_b']
    stat2['pid_nunique_b']     = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat2['pid_mean_count_b']  = stat2['uid_count_b'] / stat2['pid_nunique_b']
    stat2['apply_mean_b']      = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat2['apply_max_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat2['apply_min_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat2['apply_std_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat2['apply_median_b']    = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat2['apply_sum_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat2['apply_max_min_b']    = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat2['uflow_b']           = data[data['date'] <= dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat2[f'pflow_{mont}']           = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat2[f'flow_{mont}']        = data[data['date'] <= dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #4444444444#######################
    stat3                             = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat3.columns                     = ['core_cust_id', 'uid_count_b1']
    stat3['pid_nunique_b1']     = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat3['pid_mean_count_{mont}']  = stat3['uid_count_b1'] / stat3['pid_nunique_b1']
    stat3['apply_mean_b1']      = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat3['apply_max_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat3['apply_min_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat3['apply_std_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat3['apply_median_b1']    = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat3['apply_sum_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat3['apply_max_min_b1']    = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat3['uflow_b1']           = data[data['date'] < dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat3[f'pflow_{mont}']           = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat3[f'flow_{mont}']        = data[data['date'] < dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #5555555555555#############################
    stat4                             = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat4.columns                     = ['core_cust_id', f'uid_count_b2']
    stat4['pid_nunique_b2']     = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat4['pid_mean_count_b2']  = stat4['uid_count_b2'] / stat4['pid_nunique_b2']
    stat4['apply_mean_b2']      = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat4['apply_max_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat4['apply_min_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat4['apply_std_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat4['apply_median_b2']    = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat4['apply_sum_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat4['apply_max_min_b2']    = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat4['uflow_b2']           = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat4[f'pflow_{mont}']           = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat4[f'flow_{mont}']        = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    
    #以core_cust_id/prod_code来groupby,查看用户在该产品中产生多少笔交易
    #########################
    stat_5 = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_5.columns = ['core_cust_id', 'prod_code', 'flow_redu_b1']  
    stat_6 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_6.columns = ['core_cust_id', 'prod_code', 'flow_redu_b']
    stat_7 = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_7.columns = ['core_cust_id', 'prod_code', 'flow_redu_b2']
    stat_8 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_8.columns = ['core_cust_id', 'prod_code', 'flow_redu_1']
    stat_9 = data[(data['date'] == dict_[month][0]) & (data['date'] == dict_[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_9.columns = ['core_cust_id', 'prod_code', 'flow_redu_2']
    
    
    tmp_df = tmp_df.merge(stat, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat1, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat2, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat3, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat4, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat_5, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_6, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_7, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_8, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_9, on= ['core_cust_id', 'prod_code'], how='left')
    
    #以core_cust_id和freq_lis来groupby
    freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
    #频度统计
    for freq in freq_lis:
        group = ['core_cust_id']
        group.append(freq)
        stat_count = data[data['date'] < dict_[month]].groupby(group)['流水号'].count().reset_index()
        #stat_count = data[data['date'] < dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b1']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id']
        group.append(freq)
        stat_count = data[data['date'] <= dict_[month]].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id']
        group.append(freq)
        stat_count = data[data['date'] == dict_[month]].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_1']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id']
        group.append(freq)
        stat_count = data[data['date'] < dict_1[month][0]].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b2']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id']
        group.append(freq)
        stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_2']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    
    #以core_cust_id和prod_code来groupby
    for freq in freq_lis:
        group = ['core_cust_id', 'prod_code']
        group.append(freq)
        stat_count = data[data['date'] < dict_[month]].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b1']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id', 'prod_code']
        group.append(freq)
        stat_count = data[data['date'] <= dict_[month]].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id', 'prod_code']
        group.append(freq)
        stat_count = data[data['date'] == dict_[month]].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id', 'prod_code']
        group.append(freq)
        stat_count = data[data['date'] < dict_1[month][0]].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b2']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
    for freq in freq_lis:
        group = ['core_cust_id', 'prod_code']
        group.append(freq)
        stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(group)['流水号'].count().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_2']
        tmp_df = tmp_df.merge(stat_count, on=group, how='left')
        
    
    
    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

旧的流水

dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'], 
          '2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}

data = p_record.copy()
dfs = []

freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
            
#前：一个月、两个月；当月前、一个月前、两个月前,reduce
# redu_b  = (data['date'] <= dict_[month])
# redu_b1 = (data['date'] < dict_[month])
# redu_b2 = (data['date'] < dict_1[month][0])
# redu_1  = (data['date'] == dict_[month])
# redu_2  = (data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])

split_month = ['redu_1', 'redu_2', 'redu_b', 'redu_b1', 'redu_b2']

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    #5种不同月份拆分
    #111111####################
    stat                             = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat.columns                     = ['core_cust_id', 'uid_count_1']
    stat['pid_nunique_1']     = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat['pid_mean_count_1']  = stat[f'uid_count_1'] / stat[f'pid_nunique_1']
    stat['apply_mean_1']      = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat['apply_max_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat['apply_min_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat['apply_std_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat['apply_median_1']    = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat['apply_sum_1']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat['apply_max_min_1']    = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat['uflow_1']           = data[data['date'] == dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
    #stat[f'pflow_{mont}']           = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat[f'flow_{mont}']        = data[data['date'] == dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values

    #2222222###########
    stat1                             = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat1.columns                     = ['core_cust_id', f'uid_count_2']
    stat1['pid_nunique_2']     = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat1['pid_mean_count_2']  = stat1['uid_count_2'] / stat1['pid_nunique_2']
    stat1['apply_mean_2']      = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat1['apply_max_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat1['apply_min_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat1['apply_std_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat1['apply_median_2']    = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat1['apply_sum_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat1['apply_max_min_2']    = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat1['uflow_2']           = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['流水号'].agg('count').values
#         stat1[f'pflow_{mont}']           = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat1[f'flow_{mont}']        = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #333333333##################################
    stat2                             = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat2.columns                     = ['core_cust_id', f'uid_count_b']
    stat2['pid_nunique_b']     = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat2['pid_mean_count_b']  = stat2['uid_count_b'] / stat2['pid_nunique_b']
    stat2['apply_mean_b']      = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat2['apply_max_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat2['apply_min_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat2['apply_std_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat2['apply_median_b']    = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat2['apply_sum_b']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat2['apply_max_min_b']    = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat2['uflow_b']           = data[data['date'] <= dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat2[f'pflow_{mont}']           = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat2[f'flow_{mont}']        = data[data['date'] <= dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #4444444444#######################
    stat3                             = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat3.columns                     = ['core_cust_id', 'uid_count_b1']
    stat3['pid_nunique_b1']     = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat3['pid_mean_count_{mont}']  = stat3['uid_count_b1'] / stat3['pid_nunique_b1']
    stat3['apply_mean_b1']      = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat3['apply_max_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat3['apply_min_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat3['apply_std_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat3['apply_median_b1']    = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat3['apply_sum_b1']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat3['apply_max_min_b1']    = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat3['uflow_b1']           = data[data['date'] < dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat3[f'pflow_{mont}']           = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat3[f'flow_{mont}']        = data[data['date'] < dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    #5555555555555#############################
    stat4                             = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat4.columns                     = ['core_cust_id', f'uid_count_b2']
    stat4['pid_nunique_b2']     = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat4['pid_mean_count_b2']  = stat4['uid_count_b2'] / stat4['pid_nunique_b2']
    stat4['apply_mean_b2']      = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('mean').values
    stat4['apply_max_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
    stat4['apply_min_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
    stat4['apply_std_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
    stat4['apply_median_b2']    = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
    stat4['apply_sum_b2']       = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('sum').values
    stat4['apply_max_min_b2']    = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
    #频度统计
    stat4['uflow_b2']           = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['流水号'].agg('count').values
#         stat4[f'pflow_{mont}']           = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
#         for freq in freq_lis:
#             stat4[f'flow_{mont}']        = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
    
    #以core_cust_id/prod_code来groupby,查看用户在该产品中产生多少笔交易
    #########################
    stat_5 = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_5.columns = ['core_cust_id', 'prod_code', 'flow_redu_b1']  
    stat_6 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_6.columns = ['core_cust_id', 'prod_code', 'flow_redu_b']
    stat_7 = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_7.columns = ['core_cust_id', 'prod_code', 'flow_redu_b2']
    stat_8 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_8.columns = ['core_cust_id', 'prod_code', 'flow_redu_1']
    stat_9 = data[(data['date'] == dict_[month][0]) & (data['date'] == dict_[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_9.columns = ['core_cust_id', 'prod_code', 'flow_redu_2']
    
    
    tmp_df = tmp_df.merge(stat, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat1, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat2, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat3, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat4, on= ['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat5, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat6, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat7, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat8, on= ['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat9, on= ['core_cust_id', 'prod_code'], how='left')
    
    #以core_cust_id和freq_lis来groupby
    #freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
    #频度统计
    for freq in freq_lis:
        stat_count = data[data['date'] < dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}_freq_b1']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
    for freq in freq_lis:
        stat_count = data[data['date'] <= dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}_freq_b']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
    for freq in freq_lis:
        stat_count = data[data['date'] == dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}_freq_1']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
    for freq in freq_lis:
        stat_count = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}_freq_b2']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
    for freq in freq_lis:
        stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', f'{freq}_freq_2']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
    
    #以core_cust_id和prod_code来groupby
    for freq in freq_lis:
        stat_count = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b1']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
    for freq in freq_lis:
        stat_count = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
    for freq in freq_lis:
        stat_count = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
    for freq in freq_lis:
        stat_count = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b2']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
    for freq in freq_lis:
        stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
        stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_2']
        tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
        
    
    
    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

R点击

r = pd.read_csv(data_path_2 + 'r.csv')
r['date'] = r['r5'].apply(lambda x: x[:7])

dict_  = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dict_1 = {'2021-07-01':['2021-05', '2021-06'], '2021-08-01':['2021-06', '2021-07'], 
          '2021-09-01':['2021-07', '2021-08'], '2021-10-01':['2021-08', '2021-09']}

dfs = []

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]

    stat_1 = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat_1.columns = ['core_cust_id','uid_click_prod_count_b']
    stat_1['pid_click_nunique_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stst_1['pid_click_mean_count_b']    = stat_1['uid_click_prod_count_b'] / stat_1['pid_click_nunique_b']
    stat_1['uid_click_b']           = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['r1'].agg('count').values
    stat_1['u_r3_nunique_b']  = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['r3'].agg('nunique').values
    
    
    #查看倒数第二个月的，因为缺失9月数据
    stat_2 = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat_2.columns = ['core_cust_id','uid_click_prod_count_2']
    stat_2['pid_click_nunique_2'] = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stst_2['pid_mean_click_count_2']    = stat_1['uid_click_prod_count_2'] / stat_1['pid_click_nunique_2']
    stat_2['uid_click_2']         = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['r1'].agg('count').values
    stat_2['u_r3_nunique_2']  = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['r3'].agg('nunique').values
    
    #倒数第二月之前的
    stat_3 = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat_3.columns = ['core_cust_id','uid_click_prod_count_b2']
    stat_3['pid_click_nunique_b2'] = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stst_3['pid_mean_click_count_b2']    = stat_1['uid_click_prod_count_b2'] / stat_1['pid_click_nunique_b2']
    stat_3['uid_click_b2']         = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['r1'].agg('count').values
    stat_3['u_r3_nunique_b2']  = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['r3'].agg('nunique').values
    
    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_3, on='core_cust_id', how='left')
    ###################
    
    #产品的频度
    stat_4 = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
    stat_4.columns = ['core_cust_id', 'prod_code', 'freq_click_prod_b']
    stat_5 = r[(r['date'] == dict_1[month][0])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
    stat_5.columns = ['core_cust_id', 'prod_code', 'freq_click_prod_2']
    stat_6 = r[(r['date'] < dict_1[month][0])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
    stat_6.columns = ['core_cust_id', 'prod_code', 'freq_click_prod_b2']
    
    tmp_df = tmp_df.merge(stat_4, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_5, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_6, on=['core_cust_id', 'prod_code'], how='left')
    #点击类型频度
    stat_7 = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'r3'])['r1'].count().reset_index()
    stat_7.columns = ['core_cust_id', 'prod_code', 'freq_click_r3_b']
    stat_8 = r[(r['date'] == dict_1[month][0])].groupby(['core_cust_id', 'r3'])['r1'].count().reset_index()
    stat_8.columns = ['core_cust_id', 'prod_code', 'freq_click_r3_2']
    stat_9 = r[(r['date'] < dict_1[month][0])].groupby(['core_cust_id', 'r3'])['r1'].count().reset_index()
    stat_9.columns = ['core_cust_id', 'prod_code', 'freq_click_r3_b2']
    
    tmp_df = tmp_df.merge(stat_7, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_8, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_9, on=['core_cust_id', 'prod_code'], how='left')
    ###################
    

    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)