流水
更新流水
dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'],
'2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}
data = p_record.copy()
dfs = []
#freq_lis = [['业务代码'], ['渠道标识'], ['资金状态'], ['交易状态'], ['deal_bus'], ['deal_bus_channel'],['deal_bus_fund'], ['deal_bus_c_f']]
#前:一个月、两个月;当月前、一个月前、两个月前,reduce
# redu_b = (data['date'] <= dict_[month])
# redu_b1 = (data['date'] < dict_[month])
# redu_b2 = (data['date'] < dict_1[month][0])
# redu_1 = (data['date'] == dict_[month])
# redu_2 = (data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])
split_month = ['redu_1', 'redu_2', 'redu_b', 'redu_b1', 'redu_b2']
for month in sorted(df['a3'].unique()):
print(month)
tmp_df = df[df['a3'] == month]
#5种不同月份拆分
#111111####################
stat = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat.columns = ['core_cust_id', 'uid_count_1']
stat['pid_nunique_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat['pid_mean_count_1'] = stat[f'uid_count_1'] / stat[f'pid_nunique_1']
stat['apply_mean_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat['apply_max_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat['apply_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat['apply_std_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat['apply_median_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat['apply_sum_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat['apply_max_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat['uflow_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#stat[f'pflow_{mont}'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat[f'flow_{mont}'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#2222222###########
stat1 = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].count().reset_index()
stat1.columns = ['core_cust_id', f'uid_count_2']
stat1['pid_nunique_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat1['pid_mean_count_2'] = stat1['uid_count_2'] / stat1['pid_nunique_2']
stat1['apply_mean_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat1['apply_max_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat1['apply_min_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat1['apply_std_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat1['apply_median_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat1['apply_sum_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat1['apply_max_min_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat1['uflow_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['流水号'].agg('count').values
# stat1[f'pflow_{mont}'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat1[f'flow_{mont}'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#333333333##################################
stat2 = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat2.columns = ['core_cust_id', f'uid_count_b']
stat2['pid_nunique_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat2['pid_mean_count_b'] = stat2['uid_count_b'] / stat2['pid_nunique_b']
stat2['apply_mean_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat2['apply_max_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat2['apply_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat2['apply_std_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat2['apply_median_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat2['apply_sum_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat2['apply_max_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat2['uflow_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
# stat2[f'pflow_{mont}'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat2[f'flow_{mont}'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#4444444444#######################
stat3 = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat3.columns = ['core_cust_id', 'uid_count_b1']
stat3['pid_nunique_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat3['pid_mean_count_{mont}'] = stat3['uid_count_b1'] / stat3['pid_nunique_b1']
stat3['apply_mean_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat3['apply_max_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat3['apply_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat3['apply_std_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat3['apply_median_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat3['apply_sum_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat3['apply_max_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat3['uflow_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
# stat3[f'pflow_{mont}'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat3[f'flow_{mont}'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#5555555555555#############################
stat4 = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat4.columns = ['core_cust_id', f'uid_count_b2']
stat4['pid_nunique_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat4['pid_mean_count_b2'] = stat4['uid_count_b2'] / stat4['pid_nunique_b2']
stat4['apply_mean_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat4['apply_max_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat4['apply_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat4['apply_std_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat4['apply_median_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat4['apply_sum_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat4['apply_max_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat4['uflow_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['流水号'].agg('count').values
# stat4[f'pflow_{mont}'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat4[f'flow_{mont}'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#以core_cust_id/prod_code来groupby,查看用户在该产品中产生多少笔交易
#########################
stat_5 = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_5.columns = ['core_cust_id', 'prod_code', 'flow_redu_b1']
stat_6 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_6.columns = ['core_cust_id', 'prod_code', 'flow_redu_b']
stat_7 = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_7.columns = ['core_cust_id', 'prod_code', 'flow_redu_b2']
stat_8 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_8.columns = ['core_cust_id', 'prod_code', 'flow_redu_1']
stat_9 = data[(data['date'] == dict_[month][0]) & (data['date'] == dict_[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_9.columns = ['core_cust_id', 'prod_code', 'flow_redu_2']
tmp_df = tmp_df.merge(stat, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat1, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat2, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat3, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat4, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat_5, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_6, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_7, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_8, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_9, on= ['core_cust_id', 'prod_code'], how='left')
#以core_cust_id和freq_lis来groupby
freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
#频度统计
for freq in freq_lis:
group = ['core_cust_id']
group.append(freq)
stat_count = data[data['date'] < dict_[month]].groupby(group)['流水号'].count().reset_index()
#stat_count = data[data['date'] < dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b1']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id']
group.append(freq)
stat_count = data[data['date'] <= dict_[month]].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id']
group.append(freq)
stat_count = data[data['date'] == dict_[month]].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_1']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id']
group.append(freq)
stat_count = data[data['date'] < dict_1[month][0]].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b2']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id']
group.append(freq)
stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_2']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
#以core_cust_id和prod_code来groupby
for freq in freq_lis:
group = ['core_cust_id', 'prod_code']
group.append(freq)
stat_count = data[data['date'] < dict_[month]].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b1']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id', 'prod_code']
group.append(freq)
stat_count = data[data['date'] <= dict_[month]].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id', 'prod_code']
group.append(freq)
stat_count = data[data['date'] == dict_[month]].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id', 'prod_code']
group.append(freq)
stat_count = data[data['date'] < dict_1[month][0]].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_b2']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
for freq in freq_lis:
group = ['core_cust_id', 'prod_code']
group.append(freq)
stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(group)['流水号'].count().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'{freq}_freq_2']
tmp_df = tmp_df.merge(stat_count, on=group, how='left')
dfs.append(tmp_df)
df = pd.concat(dfs).reset_index(drop=True)
旧的流水
dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'],
'2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}
data = p_record.copy()
dfs = []
freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
#前:一个月、两个月;当月前、一个月前、两个月前,reduce
# redu_b = (data['date'] <= dict_[month])
# redu_b1 = (data['date'] < dict_[month])
# redu_b2 = (data['date'] < dict_1[month][0])
# redu_1 = (data['date'] == dict_[month])
# redu_2 = (data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])
split_month = ['redu_1', 'redu_2', 'redu_b', 'redu_b1', 'redu_b2']
for month in sorted(df['a3'].unique()):
print(month)
tmp_df = df[df['a3'] == month]
#5种不同月份拆分
#111111####################
stat = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat.columns = ['core_cust_id', 'uid_count_1']
stat['pid_nunique_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat['pid_mean_count_1'] = stat[f'uid_count_1'] / stat[f'pid_nunique_1']
stat['apply_mean_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat['apply_max_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat['apply_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat['apply_std_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat['apply_median_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat['apply_sum_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat['apply_max_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat['uflow_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#stat[f'pflow_{mont}'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat[f'flow_{mont}'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#2222222###########
stat1 = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].count().reset_index()
stat1.columns = ['core_cust_id', f'uid_count_2']
stat1['pid_nunique_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat1['pid_mean_count_2'] = stat1['uid_count_2'] / stat1['pid_nunique_2']
stat1['apply_mean_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat1['apply_max_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat1['apply_min_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat1['apply_std_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat1['apply_median_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat1['apply_sum_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat1['apply_max_min_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat1['uflow_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['流水号'].agg('count').values
# stat1[f'pflow_{mont}'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat1[f'flow_{mont}'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#333333333##################################
stat2 = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat2.columns = ['core_cust_id', f'uid_count_b']
stat2['pid_nunique_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat2['pid_mean_count_b'] = stat2['uid_count_b'] / stat2['pid_nunique_b']
stat2['apply_mean_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat2['apply_max_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat2['apply_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat2['apply_std_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat2['apply_median_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat2['apply_sum_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat2['apply_max_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat2['uflow_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
# stat2[f'pflow_{mont}'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat2[f'flow_{mont}'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#4444444444#######################
stat3 = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat3.columns = ['core_cust_id', 'uid_count_b1']
stat3['pid_nunique_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat3['pid_mean_count_{mont}'] = stat3['uid_count_b1'] / stat3['pid_nunique_b1']
stat3['apply_mean_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat3['apply_max_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat3['apply_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat3['apply_std_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat3['apply_median_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat3['apply_sum_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat3['apply_max_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat3['uflow_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
# stat3[f'pflow_{mont}'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat3[f'flow_{mont}'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#5555555555555#############################
stat4 = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat4.columns = ['core_cust_id', f'uid_count_b2']
stat4['pid_nunique_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat4['pid_mean_count_b2'] = stat4['uid_count_b2'] / stat4['pid_nunique_b2']
stat4['apply_mean_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat4['apply_max_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat4['apply_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat4['apply_std_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat4['apply_median_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat4['apply_sum_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat4['apply_max_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat4['uflow_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['流水号'].agg('count').values
# stat4[f'pflow_{mont}'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat4[f'flow_{mont}'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#以core_cust_id/prod_code来groupby,查看用户在该产品中产生多少笔交易
#########################
stat_5 = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_5.columns = ['core_cust_id', 'prod_code', 'flow_redu_b1']
stat_6 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_6.columns = ['core_cust_id', 'prod_code', 'flow_redu_b']
stat_7 = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_7.columns = ['core_cust_id', 'prod_code', 'flow_redu_b2']
stat_8 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_8.columns = ['core_cust_id', 'prod_code', 'flow_redu_1']
stat_9 = data[(data['date'] == dict_[month][0]) & (data['date'] == dict_[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_9.columns = ['core_cust_id', 'prod_code', 'flow_redu_2']
tmp_df = tmp_df.merge(stat, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat1, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat2, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat3, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat4, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat5, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat6, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat7, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat8, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat9, on= ['core_cust_id', 'prod_code'], how='left')
#以core_cust_id和freq_lis来groupby
#freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
#频度统计
for freq in freq_lis:
stat_count = data[data['date'] < dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}_freq_b1']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
for freq in freq_lis:
stat_count = data[data['date'] <= dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}_freq_b']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
for freq in freq_lis:
stat_count = data[data['date'] == dict_[month]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}_freq_1']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
for freq in freq_lis:
stat_count = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}_freq_b2']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
for freq in freq_lis:
stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', f'{freq}_freq_2']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
#以core_cust_id和prod_code来groupby
for freq in freq_lis:
stat_count = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b1']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
for freq in freq_lis:
stat_count = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
for freq in freq_lis:
stat_count = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
for freq in freq_lis:
stat_count = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_b2']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
for freq in freq_lis:
stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])[freq].value_counts().reset_index()
stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}_freq_2']
tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
dfs.append(tmp_df)
df = pd.concat(dfs).reset_index(drop=True)
R点击
r = pd.read_csv(data_path_2 + 'r.csv')
r['date'] = r['r5'].apply(lambda x: x[:7])
dict_ = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dict_1 = {'2021-07-01':['2021-05', '2021-06'], '2021-08-01':['2021-06', '2021-07'],
'2021-09-01':['2021-07', '2021-08'], '2021-10-01':['2021-08', '2021-09']}
dfs = []
for month in sorted(df['a3'].unique()):
print(month)
tmp_df = df[df['a3'] == month]
stat_1 = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].count().reset_index()
stat_1.columns = ['core_cust_id','uid_click_prod_count_b']
stat_1['pid_click_nunique_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].agg('nunique').values
stst_1['pid_click_mean_count_b'] = stat_1['uid_click_prod_count_b'] / stat_1['pid_click_nunique_b']
stat_1['uid_click_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['r1'].agg('count').values
stat_1['u_r3_nunique_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['r3'].agg('nunique').values
#查看倒数第二个月的,因为缺失9月数据
stat_2 = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['prod_code'].count().reset_index()
stat_2.columns = ['core_cust_id','uid_click_prod_count_2']
stat_2['pid_click_nunique_2'] = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['prod_code'].agg('nunique').values
stst_2['pid_mean_click_count_2'] = stat_1['uid_click_prod_count_2'] / stat_1['pid_click_nunique_2']
stat_2['uid_click_2'] = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['r1'].agg('count').values
stat_2['u_r3_nunique_2'] = r[(r['date'] == dict_1[month][0])].groupby('core_cust_id')['r3'].agg('nunique').values
#倒数第二月之前的
stat_3 = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['prod_code'].count().reset_index()
stat_3.columns = ['core_cust_id','uid_click_prod_count_b2']
stat_3['pid_click_nunique_b2'] = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['prod_code'].agg('nunique').values
stst_3['pid_mean_click_count_b2'] = stat_1['uid_click_prod_count_b2'] / stat_1['pid_click_nunique_b2']
stat_3['uid_click_b2'] = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['r1'].agg('count').values
stat_3['u_r3_nunique_b2'] = r[(r['date'] < dict_1[month][0])].groupby('core_cust_id')['r3'].agg('nunique').values
tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')
tmp_df = tmp_df.merge(stat_3, on='core_cust_id', how='left')
###################
#产品的频度
stat_4 = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
stat_4.columns = ['core_cust_id', 'prod_code', 'freq_click_prod_b']
stat_5 = r[(r['date'] == dict_1[month][0])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
stat_5.columns = ['core_cust_id', 'prod_code', 'freq_click_prod_2']
stat_6 = r[(r['date'] < dict_1[month][0])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
stat_6.columns = ['core_cust_id', 'prod_code', 'freq_click_prod_b2']
tmp_df = tmp_df.merge(stat_4, on=['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_5, on=['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_6, on=['core_cust_id', 'prod_code'], how='left')
#点击类型频度
stat_7 = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'r3'])['r1'].count().reset_index()
stat_7.columns = ['core_cust_id', 'prod_code', 'freq_click_r3_b']
stat_8 = r[(r['date'] == dict_1[month][0])].groupby(['core_cust_id', 'r3'])['r1'].count().reset_index()
stat_8.columns = ['core_cust_id', 'prod_code', 'freq_click_r3_2']
stat_9 = r[(r['date'] < dict_1[month][0])].groupby(['core_cust_id', 'r3'])['r1'].count().reset_index()
stat_9.columns = ['core_cust_id', 'prod_code', 'freq_click_r3_b2']
tmp_df = tmp_df.merge(stat_7, on=['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_8, on=['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat_9, on=['core_cust_id', 'prod_code'], how='left')
###################
dfs.append(tmp_df)
df = pd.concat(dfs).reset_index(drop=True)