用户收藏最多的商品类别
# 过程与上述类似
df_collect = df.loc[df['behavior_type']==2,['user_id','item_id','item_category']]
df_cate_most_collect = df_collect.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_collect.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_collect_max = df_cate_most_collect.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_collect_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_collect = pd.merge(df_cate_most_collect,df_cate_most_collect_max,how='left',on='user_id')
df_cate_most_collect['item_category'] = df_cate_most_collect['item_category'].astype(str)
df_cate_collect = df_cate_most_collect.loc[
df_cate_most_collect['item_category_counts']==df_cate_most_collect[
'item_category_counts_max'],'item_category'].groupby(
df_cate_most_collect['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd.merge(labels,df_cate_collect,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_collect'},inplace=True)
labels.head()
用户加购物车最多的商品类别
# 同上
df_cart = df.loc[df['behavior_type']==3,['user_id','item_id','item_category']]
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_cart_max = df_cate_most_cart.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_cart_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_cart = pd.merge(df_cate_most_cart,df_cate_most_cart_max,how='left',on='user_id')
df_cate_most_cart['item_category'] = df_cate_most_cart['item_category'].astype(str)
df_cate_cart = df_cate_most_cart.loc[df_cate_most_cart['item_category_counts']==df_cate_most_cart['item_category_counts_max'],'item_category'].groupby(df_cate_most_cart['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd.merge(labels,df_cate_cart,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_cart'},inplace=True)
labels.head()
用户购买最多的商品类别
# 同上
df_buy = df.loc[df['behavior_type']==4,['user_id','item_id','item_category']]
df_cate_most_buy = df_buy.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_buy.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_buy_max = df_cate_most_buy.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_buy_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_buy = pd.merge(df_cate_most_buy,df_cate_most_buy_max,how='left',on='user_id')
df_cate_most_buy['item_category'] = df_cate_most_buy['item_category'].astype(str)
df_cate_buy = df_cate_most_buy.loc[df_cate_most_buy['item_category_counts']==df_cate_most_buy['item_category_counts_max'],'item_category'].groupby(df_cate_most_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd.merge(labels,df_cate_buy,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_buy'},inplace=True)
labels.head()
关于时间的用户行为标签
# 近30天购买次数
# 将购买行为按用户进行分组,统计次数
df_counts_30_buy = df[df['behavior_type']==4].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_30_buy,how='left',on='user_id').rename(columns={'item_id':'counts_30_buy'})
labels.head()
df_counts_30_cart = df[df['behavior_type']==3].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_30_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_30_cart'},inplace=True)
counts_30_active = df.groupby('user_id')['date'].agg(['nunique']).reset_index()
labels = pd.merge(labels,counts_30_active,how='left',on='user_id')
labels.rename(columns={'nunique':'counts_30_active'},inplace=True)
labels.head()
# 取出最后7天的数据
df_near_7 = df[df['date']>datetime.strptime('2014-12-11','%Y-%m-%d')]
# 近7天购买次数
df_counts_7_buy = df_near_7[df_near_7['behavior_type']==4].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_7_buy,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_buy'},inplace=True)
labels.head(10)
# 近七天加购物车次数
df_counts_7_cart = df_near_7[df_near_7['behavior_type']==3].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_7_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_cart'},inplace=True)
labels.head(10)
# 近7天活跃天数
counts_7_active = df_near_7.groupby('user_id')['date'].agg(['nunique']).reset_index()
labels = pd.merge(labels,counts_7_active,how='left',on='user_id')
labels.rename(columns={'nunique':'counts_7_active'},inplace=True)
labels.head(10)
# 最后一次加购物车距今的天数
days_cart = df[df['behavior_type']==3].groupby('user_id')['date'].max().agg([lambda x:(datetime.strptime('2014-12-19','%Y-%m-%d')-x).days]).reset_index()
labels= pd.merge(labels,days_cart,how='left',on='user_id')
labels.rename(columns={'<lambda>':'days_cart'},inplace=True)
labels.head()