#0.8055
import pandas as p
import numpy as n
import xgboost as xgb
def prepr(raw):
pre=raw.copy()
pre['num']=1
#折扣率
pre['MJ']=list(map(lambda x:1 if':'in str(x) else 0 ,pre['Discount_rate']))
pre['JIAN']=list(map(lambda x:int(str(x).split(':')[1]) if":"in repr(x) else 0,pre['Discount_rate']))
pre['MI_COST']=list(map(lambda x:int(str(x).split(':')[0]) if":"in repr(x) else 0,pre['Discount_rate']))
pre['DISCOUNT']=list(map(lambda x:(float(str(x).split(':')[0])-float(str(x).split(':')[1]))/float(str(x).split(':')[0]) if":"in repr(x) else float(x),pre['Discount_rate']))
#距离
pre['Distance'].fillna(-1,inplace=True)
pre['NUII_DISTANCE']=pre['Distance'].map(lambda x:1 if x == -1 else 0)
#时间
pre['DATE_RECEIVED']=p.to_datetime(pre['Date_received'],format='%Y%m%d')
if'Date' in pre.columns.tolist():
pre['DATE']=p.to_datetime(pre['Date'],format='%Y%m%d')
pre['label'] = list(map(lambda y,x: 1 if (y-x).total_seconds()/(24*3600) <= 15 else 0,pre['DATE_RECEIVED'],pre['DATE']))
pre['label'] = pre['label'].map(int)
return pre
def construct_data(history,label):
label_f=get_label_f(label)
history_f=get_history_f(history,label)
#构造数据集
commom=list(set(label_f.columns.tolist())&set(history_f.columns.tolist()))
data=p.concat([label_f, history_f.drop(commom, axis=1)],axis=1)
#去重
data.drop_duplicates(subset=None,keep='last',inplace=True)
data.index=range(len(data))
return data
def get_history_f(history,label):
data=history.copy()
data['Coupon_id']=data['Coupon_id'].map(int)
data['Date_received']=data['Date_received'].map(int)
h_f=label.copy()
########################### 用户
keys=['User_id']
prefixs='history_field_'+'_'.join(keys)+'_'
#用户领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
#用户 领满减数
pivot=p.DataFrame(data[data['MJ']==1].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'MJ'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户领满减率
h_f[prefixs + 'lu_MJ'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'MJ'],h_f[prefixs + 'received']))
#用户15天内核销最大折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = max)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_max'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户15天内核销最小折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = min)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_min'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户15天内核销平均折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = n.mean)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_aver'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户15天内核销中位折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = n.median)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_median'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户15天内核销的最大距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = max)).rename(columns = {'Distance':prefixs + 'Distance_15_max'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#用户15天内核销的最小距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = min)).rename(columns = {'Distance':prefixs + 'Distance_15_min'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#用户15天内核销的平均距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.mean)).rename(columns = {'Distance':prefixs + 'Distance_15_mean'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#用户15天内核销的中位距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.median)).rename(columns = {'Distance':prefixs + 'Distance_15_median'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#用户15天内核销满减券减额最大值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = max)).rename(columns = {'JIAN': prefixs + "JIAN_max"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户15天内核销满减券减额最小值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = min)).rename(columns = {'JIAN': prefixs + "JIAN_min"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户15天内核销满减券减额平均值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.mean)).rename(columns = {'JIAN': prefixs + "JIAN_aver"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户15天内核销满减券减额中位值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.median)).rename(columns = {'JIAN': prefixs + "JIAN_median"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户15天内核销满减券最低消费最大值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = max)).rename(columns = {'MI_COST': prefixs + 'MI_COST_max'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户15天内核销满减券最低消费最小值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = min)).rename(columns = {'MI_COST': prefixs + 'MI_COST_min'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户15天内核销满减券最低消费平均值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = n.mean)).rename(columns = {'MI_COST': prefixs + 'MI_COST_aver'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户15天内核销满减券最低消费中位值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = n.median)).rename(columns = {'MI_COST': prefixs + 'MI_COST_medain'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
################################# 用户+商家
keys=['User_id','Merchant_id']
prefixs='history_field_'+'_'.join(keys)+'_'
#用户+商家领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+商家核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+商家核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
#用户+商家15天内核销最大折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = max)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_max'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户+商家15天内核销最小折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = min)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_min'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户+商家15天内核销平均折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = n.mean)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_aver'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户+商家15天内核销中位折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = n.median)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_median'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#用户+商家15天内核销满减券减额最大值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = max)).rename(columns = {'JIAN': prefixs + "JIAN_max"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户+商家15天内核销满减券减额最小值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = min)).rename(columns = {'JIAN': prefixs + "JIAN_min"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户+商家15天内核销满减券减额平均值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.mean)).rename(columns = {'JIAN': prefixs + "JIAN_aver"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户+商家15天内核销满减券减额中位值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.median)).rename(columns = {'JIAN': prefixs + "JIAN_median"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户+商家15天内核销满减券最低消费最大值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = max)).rename(columns = {'MI_COST': prefixs + 'MI_COST_max'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户+商家15天内核销满减券最低消费最小值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = min)).rename(columns = {'MI_COST': prefixs + 'MI_COST_min'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户+商家15天内核销满减券最低消费平均值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = n.mean)).rename(columns = {'MI_COST': prefixs + 'MI_COST_aver'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#用户+商家15天内核销满减券最低消费中位值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = n.median)).rename(columns = {'MI_COST': prefixs + 'MI_COST_medain'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
################################# 用户+优惠券
keys=['User_id','Coupon_id']
prefixs='history_field_'+'_'.join(keys)+'_'
#用户+优惠券领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+优惠券核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+优惠券核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 用户+折扣率
keys=['User_id','DISCOUNT']
prefixs='history_field_'+'_'.join(keys)+'_'
#用户+折扣率 领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+折扣率 核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x:str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+折扣率 核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 用户+日期
keys=['User_id','DATE_RECEIVED']
prefixs='history_field_'+'_'.join(keys)+'_'
#用户+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+日期核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+日期核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 用户+商家+优惠券
keys=['User_id','Merchant_id','Coupon_id']
prefixs='history_field_'+'_'.join(keys)+'_'
#用户+商家+优惠券 领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+商家+优惠券 核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+商家+优惠券 核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 用户+商家+日期
keys=['User_id','Merchant_id','DATE_RECEIVED']
prefixs='history_field_'+'_'.join(keys)+'_'
# 用户+商家+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
# 用户+商家+日期 核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
# 用户+商家+日期 核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 用户+优惠券+日期
keys=['User_id','Coupon_id','DATE_RECEIVED']
prefixs='history_field_'+'_'.join(keys)+'_'
#用户+优惠券+日期 领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+优惠券+日期 核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#用户+优惠券+日期 核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 商家
keys=['Merchant_id']
prefixs='history_field_'+'_'.join(keys)+'_'
#商家领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
#商家15天内核销的最大距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = max)).rename(columns = {'Distance':prefixs + 'Distance_15_max'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#商家15天内核销的最小距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = min)).rename(columns = {'Distance':prefixs + 'Distance_15_min'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#商家15天内核销的平均距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.mean)).rename(columns = {'Distance':prefixs + 'Distance_15_aver'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#商家15天内核销的中位距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.median)).rename(columns = {'Distance':prefixs + 'Distance_15_median'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#商家15天内核销最大折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = max)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_max'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#商家15天内核销最小折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = min)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_min'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#商家15天内核销平均折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = n.mean)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_aver'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#商家15天内核销中位折扣率
pivot = p.DataFrame(data[data['label']==1].pivot_table( index = keys, values ='DISCOUNT',aggfunc = n.median)).rename(columns = {'DISCOUNT':prefixs + 'DISCOUNT_15_median'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how ='left')
#商家15天内核销满减券减额最大值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = max)).rename(columns = {'JIAN': prefixs + "JIAN_max"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#商家15天内核销满减券减额最小值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = min)).rename(columns = {'JIAN': prefixs + "JIAN_min"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#商家15天内核销满减券减额平均值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.mean)).rename(columns = {'JIAN': prefixs + "JIAN_aver"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#商家15天内核销满减券减额中位值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.median)).rename(columns = {'JIAN': prefixs + "JIAN_median"}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#商家15天内核销满减券最低消费最大值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = max)).rename(columns = {'MI_COST': prefixs + 'MI_COST_max'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#商家15天内核销满减券最低消费最小值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = min)).rename(columns = {'MI_COST': prefixs + 'MI_COST_min'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#商家15天内核销满减券最低消费平均值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = n.mean)).rename(columns = {'MI_COST': prefixs + 'MI_COST_aver'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
#商家15天内核销满减券最低消费中位值
pivot = p.DataFrame(data[data['label']==1].pivot_table(index = keys, values = 'MI_COST', aggfunc = n.median)).rename(columns = {'MI_COST': prefixs + 'MI_COST_medain'}).reset_index()
h_f = p.merge(h_f, pivot, on = keys, how = 'left')
################################# 商家+优惠券
keys=['Merchant_id','Coupon_id']
prefixs='history_field_'+'_'.join(keys)+'_'
#商家+优惠券领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+优惠券核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x:str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+优惠券核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 商家+折扣率
keys=['Merchant_id','DISCOUNT']
prefixs='history_field_'+'_'.join(keys)+'_'
#商家+折扣率领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+折扣率核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x:str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+折扣率核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 商家+日期
keys=['Merchant_id','DATE_RECEIVED']
prefixs='history_field_'+'_'.join(keys)+'_'
#商家+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+日期核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x:str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+日期核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 商家+优惠券+日期
keys=['Merchant_id','Coupon_id','DATE_RECEIVED']
prefixs='history_field_'+'_'.join(keys)+'_'
#商家+优惠券+日期 领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+优惠券+日期核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x:str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#商家+优惠券+日期核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 优惠券
keys=['Coupon_id']
prefixs='history_field_'+'_'.join(keys)+'_'
#优惠券领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#优惠券核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#优惠券核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
#优惠券15天内核销的最大距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = max)).rename(columns = {'Distance':prefixs + 'Distance_15_max'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#优惠券15天内核销的最小距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = min)).rename(columns = {'Distance':prefixs + 'Distance_15_min'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#优惠券15天内核销的平均距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.mean)).rename(columns = {'Distance':prefixs + 'Distance_15_mean'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#优惠券15天内核销的中位距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.median)).rename(columns = {'Distance':prefixs + 'Distance_15_median'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
################################# 优惠券+日期
keys=['Coupon_id','DATE_RECEIVED']
prefixs='history_field_'+'_'.join(keys)+'_'
#优惠券+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#优惠券+日期核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#优惠券+日期核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
################################# 折扣率
keys=['DISCOUNT']
prefixs='history_field_'+'_'.join(keys)+'_'
#折扣率领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#折扣率核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x:str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#折扣率核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
#################################日期
keys=['DATE_RECEIVED']
prefixs='history_field_'+'_'.join(keys)+'_'
#当日领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#当日核销数
pivot=p.DataFrame(data[data['Date'].map(lambda x : str(x) !='nan')].pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received_use'}).reset_index()
h_f=p.merge(h_f,pivot,on=keys,how='left')
#当日核销率
h_f[prefixs + 'lu_use'] = list(map(lambda x,y: x/y if y!=0 else 0 ,h_f[prefixs + 'received_use'],h_f[prefixs + 'received']))
#当日15天内核销的最大距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = max)).rename(columns = {'Distance':prefixs + 'Distance_15_max'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#当日15天内核销的最小距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = min)).rename(columns = {'Distance':prefixs + 'Distance_15_min'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#当日15天内核销的平均距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.mean)).rename(columns = {'Distance':prefixs + 'Distance_15_mean'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#当日15天内核销的中位距离
pivot =p.DataFrame(data[data['label']==1].pivot_table( index = keys, values = 'Distance',aggfunc = n.median)).rename(columns = {'Distance':prefixs + 'Distance_15_median'}).reset_index()
h_f = p.merge(h_f, pivot, how ='left', on = keys)
#用户距离正反排序
h_f['label_User_distance_true_rank']=h_f.groupby('User_id')['Distance'].rank(ascending=True)
h_f['label_User_distance_False_rank']=h_f.groupby('User_id')['Distance'].rank(ascending=False)
#用户折扣正反排序
h_f['label_User_discount_rate_true_rank']=h_f.groupby('User_id')['DISCOUNT'].rank(ascending=True)
h_f['label_User_discount_rate_False_rank']=h_f.groupby('User_id')['DISCOUNT'].rank(ascending=False)
#用户领券日期正反排序
h_f['label_User_date_received_true_rank']=h_f.groupby('User_id')['DATE_RECEIVED'].rank(ascending=True)
h_f['label_User_date_received_False_rank']=h_f.groupby('User_id')['DATE_RECEIVED'].rank(ascending=False)
####
#商家距离正反排序
h_f['label_Merchant_distance_true_rank']=h_f.groupby('Merchant_id')['Distance'].rank(ascending=True)
h_f['label_Merchant_distance_False_rank']=h_f.groupby('Merchant_id')['Distance'].rank(ascending=False)
#商家折扣正反排序
h_f['label_Merchant_discount_rate_true_rank']=h_f.groupby('Merchant_id')['DISCOUNT'].rank(ascending=True)
h_f['label_Merchant_discount_rate_False_rank']=h_f.groupby('Merchant_id')['DISCOUNT'].rank(ascending=False)
#商家领券日期正反排序
h_f['label_Merchant_date_received_true_rank']=h_f.groupby('Merchant_id')['DATE_RECEIVED'].rank(ascending=True)
h_f['label_Merchant_date_received_False_rank']=h_f.groupby('Merchant_id')['DATE_RECEIVED'].rank(ascending=False)
#####
############################################~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#优惠券距离正反排序
h_f['label_Coupon_distance_true_rank']=h_f.groupby('Coupon_id')['Distance'].rank(ascending=True)
h_f['label_Coupon_distance_False_rank']=h_f.groupby('Coupon_id')['Distance'].rank(ascending=False)
#优惠券折扣正反排序
h_f['label_Coupon_discount_rate_true_rank']=h_f.groupby('Coupon_id')['DISCOUNT'].rank(ascending=True)
h_f['label_Coupon_discount_rate_False_rank']=h_f.groupby('Coupon_id')['DISCOUNT'].rank(ascending=False)
#优惠券领券日期正反排序
h_f['label_Coupon_date_received_true_rank']=h_f.groupby('Coupon_id')['DATE_RECEIVED'].rank(ascending=True)
h_f['label_Coupon_date_received_False_rank']=h_f.groupby('Coupon_id')['DATE_RECEIVED'].rank(ascending=False)
h_f.fillna(0,downcast='infer',inplace=True)
return h_f
def get_label_f(label):
data=label.copy()
data['Coupon_id']=data['Coupon_id'].map(int)
data['Date_received']=data['Date_received'].map(int)
l_f= label.copy()
###################################用户
keys=['User_id']
prefixs='label_field_'+'_'.join(keys)+'_'
#每个用户领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#用户领券的最大距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = max)).rename(columns = {'Distance': prefixs + "Distance_max"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = min)).rename(columns = {'Distance': prefixs + "Distance_min"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = n.mean)).rename(columns = {'Distance': prefixs + "Distance_aver"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = n.median)).rename(columns = {'Distance': prefixs + "Distance_median"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户优惠券折扣率最大值
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = max)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_max'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户优惠券折扣率最小值
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = min)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_min'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户优惠券折扣率平均数
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = n.mean)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_mean'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户优惠券折扣率中位数
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = n.median)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_median'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券最低消费最大值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=max)).rename(columns = {'MI_COST': prefixs + 'MI_COST_max'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券最低消费最小值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=min)).rename(columns = {'MI_COST': prefixs + 'MI_COST_min'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券最低消费平均数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=n.mean)).rename(columns = {'MI_COST': prefixs + 'MI_COST_aver'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券最低消费中位数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=n.median)).rename(columns = {'MI_COST': prefixs + 'MI_COST_median'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券减额最大值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = max)).rename(columns = {'JIAN': prefixs + "JIAN_max"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券减额最小值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = min)).rename(columns = {'JIAN': prefixs + "JIAN_min"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券减额平均值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.mean)).rename(columns = {'JIAN': prefixs + "JIAN_aver"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户领满减券减额中位数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.median)).rename(columns = {'JIAN': prefixs + "JIAN_median"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
tmp = data[keys+['DATE_RECEIVED']].sort_values(['DATE_RECEIVED'],ascending=True)
#用户第一次领券
first = tmp.drop_duplicates(keys,keep="first")
first[prefixs+"is_first_received"]=1
l_f = p.merge(l_f,first,on=keys+['DATE_RECEIVED'],how="left")
#用户最后一次领券
last = tmp.drop_duplicates(keys,keep="last")
last[prefixs+"is_last_received"] = 1
l_f = p.merge(l_f,last,on=keys+['DATE_RECEIVED'],how="left")
#################################用户+商家
keys=['User_id','Merchant_id']
prefixs='label_field_'+'_'.join(keys)+'_'
#用户+商家领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#用户+商家优惠券折扣率最大值
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = max)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_max'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家优惠券折扣率最小值
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = min)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_min'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家优惠券折扣率平均数
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = n.mean)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_mean'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家优惠券折扣率中位数
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = n.median)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_median'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券最低消费最大值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=max)).rename(columns = {'MI_COST': prefixs + 'MI_COST_max'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券最低消费最小值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=min)).rename(columns = {'MI_COST': prefixs + 'MI_COST_min'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券最低消费平均数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=n.mean)).rename(columns = {'MI_COST': prefixs + 'MI_COST_aver'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券最低消费中位数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=n.median)).rename(columns = {'MI_COST': prefixs + 'MI_COST_median'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券减额最大值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = max)).rename(columns = {'JIAN': prefixs + "JIAN_max"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券减额最小值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = min)).rename(columns = {'JIAN': prefixs + "JIAN_min"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券减额平均值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.mean)).rename(columns = {'JIAN': prefixs + "JIAN_aver"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#用户+商家领满减券减额中位数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.median)).rename(columns = {'JIAN': prefixs + "JIAN_median"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
tmp = data[keys+['DATE_RECEIVED']].sort_values(['DATE_RECEIVED'],ascending=True)
#用户+商家第一次领券
first = tmp.drop_duplicates(keys,keep="first")
first[prefixs+"is_first_received"]=1
l_f = p.merge(l_f,first,on=keys+['DATE_RECEIVED'],how="left")
#用户+商家最后一次领券
last = tmp.drop_duplicates(keys,keep="last")
last[prefixs+"is_last_received"] = 1
l_f = p.merge(l_f,last,on=keys+['DATE_RECEIVED'],how="left")
#################################用户+优惠券
keys=['User_id','Coupon_id']
prefixs='label_field_'+'_'.join(keys)+'_'
#用户+优惠券领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################用户+折扣率
keys=['User_id','DISCOUNT']
prefixs='label_field_'+'_'.join(keys)+'_'
#用户+折扣率领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
tmp = data[keys+['DATE_RECEIVED']].sort_values(['DATE_RECEIVED'],ascending=True)
#用户+折扣率第一次领券
first = tmp.drop_duplicates(keys,keep="first")
first[prefixs+"is_first_received"]=1
l_f = p.merge(l_f,first,on=keys+['DATE_RECEIVED'],how="left")
#用户+折扣率最后一次领券
last = tmp.drop_duplicates(keys,keep="last")
last[prefixs+"is_last_received"] = 1
l_f = p.merge(l_f,last,on=keys+['DATE_RECEIVED'],how="left")
#################################用户+日期
keys=['User_id','DATE_RECEIVED']
prefixs='label_field_'+'_'.join(keys)+'_'
#用户+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################用户+商家+优惠券
keys=['User_id','Merchant_id','Coupon_id']
prefixs='label_field_'+'_'.join(keys)+'_'
#用户+商家+优惠券领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################用户+商家+日期
keys=['User_id','Merchant_id','DATE_RECEIVED']
prefixs='label_field_'+'_'.join(keys)+'_'
#用户+商家+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################用户+优惠券+日期
keys=['User_id','Coupon_id','DATE_RECEIVED']
prefixs='label_field_'+'_'.join(keys)+'_'
#用户+优惠券+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################商家
keys=['Merchant_id']
prefixs='label_field_'+'_'.join(keys)+'_'
#商家被领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#商家被领券的最大距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = max)).rename(columns = {'Distance': prefixs + "Distance_max"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = min)).rename(columns = {'Distance': prefixs + "Distance_min"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = n.mean)).rename(columns = {'Distance': prefixs + "Distance_aver"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = n.median)).rename(columns = {'Distance': prefixs + "Distance_median"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家优惠券折扣率最大值
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = max)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_max'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家优惠券折扣率最小值
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = min)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_min'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家优惠券折扣率平均数
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = n.mean)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_mean'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家优惠券折扣率中位数
pivot = p.DataFrame(data.pivot_table( index = keys, values = 'DISCOUNT', aggfunc = n.median)).rename(columns = {'DISCOUNT': prefixs + 'DISCOUNT_median'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券最低消费最大值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=max)).rename(columns = {'MI_COST': prefixs + 'MI_COST_max'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券最低消费最小值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=min)).rename(columns = {'MI_COST': prefixs + 'MI_COST_min'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券最低消费平均数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=n.mean)).rename(columns = {'MI_COST': prefixs + 'MI_COST_aver'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券最低消费中位数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'MI_COST',aggfunc=n.median)).rename(columns = {'MI_COST': prefixs + 'MI_COST_median'}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券减额最大值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = max)).rename(columns = {'JIAN': prefixs + "JIAN_max"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券减额最小值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = min)).rename(columns = {'JIAN': prefixs + "JIAN_min"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券减额平均值
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.mean)).rename(columns = {'JIAN': prefixs + "JIAN_aver"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#商家被领满减券减额中位数
pivot = p.DataFrame(data[data['MJ']==1].pivot_table(index = keys, values = 'JIAN', aggfunc = n.median)).rename(columns = {'JIAN': prefixs + "JIAN_median"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
tmp = data[keys+['DATE_RECEIVED']].sort_values(['DATE_RECEIVED'],ascending=True)
#商家被第一次领券
first = tmp.drop_duplicates(keys,keep="first")
first[prefixs+"is_first_received"]=1
l_f = p.merge(l_f,first,on=keys+['DATE_RECEIVED'],how="left")
#商家被最后一次领券
last = tmp.drop_duplicates(keys,keep="last")
last[prefixs+"is_last_received"] = 1
l_f = p.merge(l_f,last,on=keys+['DATE_RECEIVED'],how="left")
#################################商家+优惠券
keys=['Merchant_id','Coupon_id']
prefixs='label_field_'+'_'.join(keys)+'_'
#商家+优惠券领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################商家+折扣率
keys=['Merchant_id','DISCOUNT']
prefixs='label_field_'+'_'.join(keys)+'_'
#商家+折扣率领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
tmp = data[keys+['DATE_RECEIVED']].sort_values(['DATE_RECEIVED'],ascending=True)
#商家+折扣率第一次领券
first = tmp.drop_duplicates(keys,keep="first")
first[prefixs+"is_first_received"]=1
l_f = p.merge(l_f,first,on=keys+['DATE_RECEIVED'],how="left")
#商家+折扣率最后一次领券
last = tmp.drop_duplicates(keys,keep="last")
last[prefixs+"is_last_received"] = 1
l_f = p.merge(l_f,last,on=keys+['DATE_RECEIVED'],how="left")
#################################商家+日期
keys=['Merchant_id','DATE_RECEIVED']
prefixs='label_field_'+'_'.join(keys)+'_'
#商家+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################商家+优惠券+日期
keys=['Merchant_id','Coupon_id','DATE_RECEIVED']
prefixs='label_field_'+'_'.join(keys)+'_'
#商家+优惠券+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################优惠券
keys=['Coupon_id']
prefixs='label_field_'+'_'.join(keys)+'_'
#优惠券领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#优惠券被领券的最大距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = max)).rename(columns = {'Distance': prefixs + "Distance_max"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#优惠券被领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = min)).rename(columns = {'Distance': prefixs + "Distance_min"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#优惠券被领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = n.mean)).rename(columns = {'Distance': prefixs + "Distance_aver"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#优惠券被领券的最小距离
pivot = p.DataFrame(data.pivot_table(index = keys, values = 'Distance', aggfunc = n.median)).rename(columns = {'Distance': prefixs + "Distance_median"}).reset_index()
l_f = p.merge(l_f, pivot, on = keys, how = 'left')
#################################优惠券+日期
keys=['Coupon_id','DATE_RECEIVED']
prefixs='label_field_'+'_'.join(keys)+'_'
#优惠券+日期领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#################################折扣率
keys=['DISCOUNT']
prefixs='label_field_'+'_'.join(keys)+'_'
#折扣率 被领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
tmp = data[keys+['DATE_RECEIVED']].sort_values(['DATE_RECEIVED'],ascending=True)
#折扣率 被第一次领券
first = tmp.drop_duplicates(keys,keep="first")
first[prefixs+"is_first_received"]=1
l_f = p.merge(l_f,first,on=keys+['DATE_RECEIVED'],how="left")
#折扣率 被最后一次领券
last = tmp.drop_duplicates(keys,keep="last")
last[prefixs+"is_last_received"] = 1
l_f = p.merge(l_f,last,on=keys+['DATE_RECEIVED'],how="left")
#################################日期
keys=['DATE_RECEIVED']
prefixs='label_field_'+'_'.join(keys)+'_'
#当日领券数
pivot=p.DataFrame(data.pivot_table(index=keys,values='num',aggfunc=len)).rename(columns={'num':prefixs+'received'}).reset_index()
l_f=p.merge(l_f,pivot,on=keys,how='left')
#用户距离正反排序
l_f['label_User_distance_true_rank']=l_f.groupby('User_id')['Distance'].rank(ascending=True)
l_f['label_User_distance_False_rank']=l_f.groupby('User_id')['Distance'].rank(ascending=False)
#用户折扣正反排序
l_f['label_User_discount_rate_true_rank']=l_f.groupby('User_id')['DISCOUNT'].rank(ascending=True)
l_f['label_User_discount_rate_False_rank']=l_f.groupby('User_id')['DISCOUNT'].rank(ascending=False)
####
#商家距离正反排序
l_f['label_Merchant_distance_true_rank']=l_f.groupby('Merchant_id')['Distance'].rank(ascending=True)
l_f['label_Merchant_distance_False_rank']=l_f.groupby('Merchant_id')['Distance'].rank(ascending=False)
#商家折扣正反排序
l_f['label_Merchant_discount_rate_true_rank']=l_f.groupby('Merchant_id')['DISCOUNT'].rank(ascending=True)
l_f['label_Merchant_discount_rate_False_rank']=l_f.groupby('Merchant_id')['DISCOUNT'].rank(ascending=False)
#####
############################################~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#优惠券距离正反排序
l_f['label_Coupon_distance_true_rank']=l_f.groupby('Coupon_id')['Distance'].rank(ascending=True)
l_f['label_Coupon_distance_False_rank']=l_f.groupby('Coupon_id')['Distance'].rank(ascending=False)
#优惠券折扣正反排序
l_f['label_Coupon_discount_rate_true_rank']=l_f.groupby('Coupon_id')['DISCOUNT'].rank(ascending=True)
l_f['label_Coupon_discount_rate_False_rank']=l_f.groupby('Coupon_id')['DISCOUNT'].rank(ascending=False)
#优惠券领券日期正反排序
l_f['label_Coupon_date_received_true_rank']=l_f.groupby('Coupon_id')['DATE_RECEIVED'].rank(ascending=True)
l_f['label_Coupon_date_received_False_rank']=l_f.groupby('Coupon_id')['DATE_RECEIVED'].rank(ascending=False)
l_f.fillna(0,downcast='infer',inplace=True)
return l_f
def model_xgb(train,test):
params={
"booster": 'gbtree',
'objective': 'binary:logistic',
'eval_metric': 'auc',
'silent': 0,#(静默模式,1开0关)
'eta': 0.01,#(0.01~0.2,,,0.01)
'max_depth': 5,#(3~10,,,6)
'min_child_weight': 1,
'gamma': 0,
'lambda': 1,
'colsample_bylevel': 0.7,#(作用与subsample相似)
'colsample_bytree': 0.7,#(0.5~1)
'subsample': 0.9,#(0.5~1)
'scale_pos_weight': 1,#(算法更快收敛)
}
#数据集
dtrain=xgb.DMatrix(train.drop(['User_id','Coupon_id','Merchant_id', 'Discount_rate', 'Date', 'DATE_RECEIVED','Date_received','label','DATE'],axis=1),label=train['label'])
dtest=xgb.DMatrix(test.drop(['User_id','Coupon_id','Merchant_id', 'Discount_rate', 'DATE_RECEIVED','Date_received'],axis=1))
#训练
watchlist=[(dtrain,'train')]
model=xgb.train(params,dtrain,3,watchlist)
#1300轮
#预测
predict=model.predict(dtest)
#结果
predict=p.DataFrame(predict,columns=['prob'])
result=p.concat([test[['User_id','Coupon_id','Date_received']],predict],axis=1)
#特征的重要性
feat_importance = p.DataFrame(columns=['feature_name', 'importance'])
feat_importance['feature_name'] = model.get_score().keys()
feat_importance['importance'] = model.get_score().values()
feat_importance.sort_values(['importance'], ascending=False, inplace=True)
return result,feat_importance
if __name__ =='__main__':
#原数据
raw_train=p.read_csv("D:/1A动手实践/Python/1O2O/ccf_offline_stage1_train.csv")
raw_test=p.read_csv("D:/1A动手实践/Python/1O2O/ccf_offline_stage1_test_revised.csv")
#预处理
prepr_train=prepr(raw_train)
prepr_test=prepr(raw_test)
#划分区间
#训练集 历史,中间,标签区间
train_history=prepr_train[prepr_train['DATE_RECEIVED'].isin(p.date_range('2016/3/2',periods=60))]
train_label=prepr_train[prepr_train['DATE_RECEIVED'].isin(p.date_range('2016/5/16',periods=31))]
#验证集 历史,中间,标签区间
verification_history=prepr_train[prepr_train['DATE_RECEIVED'].isin(p.date_range('2016/1/16',periods=60))]
verification_label=prepr_train[prepr_train['DATE_RECEIVED'].isin(p.date_range('2016/3/31',periods=31))]
#测试集 历史,中间,标签区间
test_history=prepr_train[prepr_train['DATE_RECEIVED'].isin(p.date_range('2016/4/17',periods=60))]
test_label=prepr_test.copy()
#构造数据集
complete_train=construct_data(train_history,train_label)
complete_verification=construct_data(verification_history,verification_label)
complete_test=construct_data(test_history,test_label)
good_train=p.concat([complete_train,complete_verification],axis=0)
result,feat_importance=model_xgb(good_train,complete_test)
result.to_csv("10-17-2.csv",index=False,header=None)
(学习赛)O2O优惠券使用预测
于 2021-07-27 07:20:10 首次发布