产品
#################################
dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'],
'2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}
data = p_record.copy()
dfs = []
freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
#前:一个月、两个月;当月前、一个月前、两个月前,reduce
# redu_b = (data['date'] <= dict_[month])
# redu_b1 = (data['date'] < dict_[month])
# redu_b2 = (data['date'] < dict_1[month][0])
# redu_1 = (data['date'] == dict_[month])
# redu_2 = (data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])
split_month = ['redu_1', 'redu_2', 'redu_b', 'redu_b1', 'redu_b2']
for month in sorted(df['a3'].unique()):
print(month)
tmp_df = df[df['a3'] == month]
#5种不同月份拆分
#111111####################
stat = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat.columns = ['core_cust_id', 'uid_count_1']
stat['pid_nunique_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat['pid_mean_count_1'] = stat[f'uid_count_1'] / stat[f'pid_nunique_1']
stat['apply_mean_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat['apply_max_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat['apply_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat['apply_std_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat['apply_median_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat['apply_sum_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat['apply_max_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat['uflow_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
#stat[f'pflow_{mont}'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat[f'flow_{mont}'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#2222222###########
stat1 = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].count().reset_index()
stat1.columns = ['core_cust_id', f'uid_count_2']
stat1['pid_nunique_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat1['pid_mean_count_2'] = stat1['uid_count_2'] / stat1['pid_nunique_2']
stat1['apply_mean_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat1['apply_max_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat1['apply_min_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat1['apply_std_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat1['apply_median_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat1['apply_sum_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat1['apply_max_min_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat1['uflow_{mont}'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['流水号'].agg('count').values
# stat1[f'pflow_{mont}'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat1[f'flow_{mont}'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#333333333##################################
stat2 = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat2.columns = ['core_cust_id', f'uid_count_b']
stat2['pid_nunique_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat2['pid_mean_count_b'] = stat2['uid_count_b'] / stat2['pid_nunique_b']
stat2['apply_mean_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat2['apply_max_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat2['apply_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat2['apply_std_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat2['apply_median_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat2['apply_sum_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat2['apply_max_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat2['uflow_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
# stat2[f'pflow_{mont}'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat2[f'flow_{mont}'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#4444444444#######################
stat3 = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat3.columns = ['core_cust_id', 'uid_count_b1']
stat3['pid_nunique_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat3['pid_mean_count_{mont}'] = stat3['uid_count_b1'] / stat3['pid_nunique_b1']
stat3['apply_mean_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat3['apply_max_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat3['apply_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat3['apply_std_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat3['apply_median_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat3['apply_sum_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat3['apply_max_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat3['uflow_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
# stat3[f'pflow_{mont}'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat3[f'flow_{mont}'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#5555555555555#############################
stat4 = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].count().reset_index()
stat4.columns = ['core_cust_id', f'uid_count_b2']
stat4['pid_nunique_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat4['pid_mean_count_b2'] = stat4['uid_count_b2'] / stat4['pid_nunique_b2']
stat4['apply_mean_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat4['apply_max_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat4['apply_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat4['apply_std_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat4['apply_median_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat4['apply_sum_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat4['apply_max_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat4['uflow_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['流水号'].agg('count').values
# stat4[f'pflow_{mont}'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
# for freq in freq_lis:
# stat4[f'flow_{mont}'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', freq])['流水号'].agg('count').values
#########################
stat_5 = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_5.columns = ['core_cust_id', 'prod_code', 'flow_redu_b1']
stat_6 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_6.columns = ['core_cust_id', 'prod_code', 'flow_redu_b']
stat_7 = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_7.columns = ['core_cust_id', 'prod_code', 'flow_redu_b2']
stat_8 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_8.columns = ['core_cust_id', 'prod_code', 'flow_redu_1']
stat_9 = data[(data['date'] == dict_[month][0]) & (data['date'] == dict_[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
stat_9.columns = ['core_cust_id', 'prod_code', 'flow_redu_2']
tmp_df = tmp_df.merge(stat, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat1, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat2, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat3, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat4, on= ['core_cust_id'], how='left')
tmp_df = tmp_df.merge(stat5, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat6, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat7, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat8, on= ['core_cust_id', 'prod_code'], how='left')
tmp_df = tmp_df.merge(stat9, on= ['core_cust_id', 'prod_code'], how='left')
dfs.append(tmp_df)
df = pd.concat(dfs).reset_index(drop=True)
加(#############)是更改过的
product = pd.read_pickle('product.pkl')
#产品的再次分类
product['prod_4'] = product['prod_code'].apply(lambda x:x[:4])
product['prod_3'] = product['prod_code'].apply(lambda x:x[:3])
product['prod_1'] = product['prod_code'].apply(lambda x:x[:1])
#收益风险比
product['收益风险比'] = product['预期收益率'] / product['风险等级']
#持有天数和收益率关系
product['天数收益比'] = product['预期收益率'] / product['持有天数']
###############################################
product.rename(columns={'prod_class':'a2'}, inplace=True)
#加category特征频度统计,
#product_counts = product.groupby(['a2'])['计价类型'].value_counts().reset_index()
count_lis_p = ['计价类型', '周期类型', '模式', '风险等级', '是否允许变更分红方式', '管理方式', '业务模式', '收益特点', '期限', '投资模式', '展示等级']
for fea in count_lis_p:
product[f'{fea}_count'] = product.groupby(['a2', fea])['prod_code'].transform(lambda x:x.count()).values.tolist()
#product['计价类型_count'] = product.groupby(['a2', '计价类型'])['prod_code'].transform(lambda x:x.count()).values.tolist()
###############################################
com_pid = set(product['prod_code'].unique()).intersection(set(df['prod_code'].unique())) # 求交集
tmp = product[product['prod_code'].isin(com_pid)].describe().T
useful_cols = list(tmp[(tmp['std'] != 0) & (tmp['std'] != np.nan)].index) # 判断标准差
useful_cols = [c for c in useful_cols if c not in ['数据日期']]
print(useful_cols)
if len(useful_cols) > 0:
#########################################################
df = df.merge(product[['prod_code']+useful_cols], on=['prod_code', 'a2'], how='left')
产品流水
p_record = pd.read_pickle('prod_record.pkl')
#日期的年月
p_record['date'] = p_record['trade_date'].apply(lambda x: str(x)[:6])
#申请金额apply_amont
p_record.rename(columns={'申请金额': 'apply_amt'}, inplace=True)
#交易状态 + 资金状态 + 业务代码 + 渠道标识 排序组合
p_record['deal_bus'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str')
p_record['deal_bus_channel'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['渠道标识'].astype('str')
p_record['deal_bus_fund'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['资金状态'].astype('str')
p_record['deal_bus_c_f'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['渠道标识'].astype('str') + p_record['资金状态'].astype('str')
#p_record['deal_bus'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str')
#整个月的平均值与最大值或最小值相比,如果等,那就是不买,如果不等,那就购买
p_record.rename(columns={'trade_date':'a3'},inplace = True)
p_record.rename(columns={'prod_class':'a2'},inplace = True)
df = df.merge(p_record, on= ['core_cust_id','prod_code','a3','a2'], how='left')
def diff_max_min(x):
return x.max() - x.min()
dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'],
'2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}
data = p_record.copy()
dfs = []
freq_lis = [业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel','deal_bus_fund', 'deal_bus_c_f']
#前:一个月、两个月;当月前、一个月前、两个月前,reduce
redu_b = (data['date'] <= dict_[month])
redu_b1 = (data['date'] < dict_[month])
redu_b2 = (data['date'] < dict_1[month][0])
redu_1 = (data['date'] == dict_[month])
redu_2 = (data['date'] == dict_1[month])
split_month = [reduce_1, reduce_2, reduce_b, reduce_b1, reduce_b2]
for month in sorted(df['a3'].unique()):
print(month)
tmp_df = df[df['a3'] == month]
#5种不同月份拆分
for i, month in enumerate(split_month):
con = split_month[i]
stat = data[con].groupby('core_cust_id')['prod_code'].count().reset_index()
stat.columns = ['core_cust_id', f'uid_count_{month}']
stat[f'pid_nunique_{month}'] = data[con].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat[f'pid_mean_count_{month}'] = stat_1[f'uid_count{month}'] / stat_1[f'pid_nunique{month}']
stat[f'apply_mean_{month}'] = data[con].groupby('core_cust_id')['apply_amt'].agg('mean').values
stat[f'apply_max_{month}'] = data[con].groupby('core_cust_id')['apply_amt'].agg('max').values # 我们加的
stat[f'apply_min_{month}'] = data[con].groupby('core_cust_id')['apply_amt'].agg('min').values # 我们加的
stat[f'apply_std_{month}'] = data[con].groupby('core_cust_id')['apply_amt'].agg('std').values # 我们加的
stat[f'apply_median_{month}'] = data[con].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
stat[f'apply_sum_{month}'] = data[con].groupby('core_cust_id')['apply_amt'].agg('sum').values
stat[f'apply_max_min{month}'] = data[con].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#频度统计
stat[f'uflow_{month}'] = data[con].groupby('core_cust_id')['流水号'].agg('count').values
stat[f'pflow_{month}'] = data[con].groupby(['core_cust_id', 'prod_code'])['流水号'].agg('count').values
for freq in freq_lis:
stat[f'flow_{month}'] = data[con].groupby(['core_cust_id', freq])['流水号'].agg('count').values
tmp_df = tmp_df.merge(stat, on= ['core_cust_id','prod_code'], how='left')
dfs.append(tmp_df)
df = pd.concat(dfs).reset_index(drop=True)
R点击表
r = pd.read_csv(data_path_2 + 'r.csv')
r['date'] = r['r5'].apply(lambda x: x[:7])
dict_ = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dict_1 = {'2021-07-01':['2021-05', '2021-06'], '2021-08-01':['2021-06', '2021-07'],
'2021-09-01':['2021-07', '2021-08'], '2021-10-01':['2021-08', '2021-09']}
dfs = []
for month in sorted(df['a3'].unique()):
print(month)
tmp_df = df[df['a3'] == month]
stat_1 = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].count().reset_index()
stat_1.columns = ['core_cust_id','uid_click_prod_count']
stat_1['pid_click_nunique_all'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].agg('nunique').values
stat_1['uid_click_all'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['r1'].agg('count').values
###################
groupby(id), 频度统计、统计量
groupby(id, prod), 频度统计、统计量
###################
tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
dfs.append(tmp_df)
df = pd.concat(dfs).reset_index(drop=True)