Taobao User Behavior Analysis: User Personas

1. Dataset Description

The dataset is one month of Taobao user behavior with six fields: user_id, item_id, behavior_type (1 = browse, 2 = collect, 3 = add to cart, 4 = buy), user_geohash, item_category, and time. It holds roughly 23 million records in total; because that is too large to process comfortably, only a random 20% sample (about 4.66 million rows) is used below. Also, given the limits of these fields, the persona tags built in this project are only a small part of a full user persona; they are whatever can be constructed from the available data.

# import libraries
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import gc
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
# load the dataset
df_orginal = pd.read_csv('/home/kesci/input/mydata9388/taobao_persona.csv')

2. Data Preprocessing

2.1 Sampling

# the dataset is too large to process quickly, so randomly sample 20% (set random_state to a fixed value if the sample needs to be reproducible)
df = df_orginal.sample(frac=0.2,random_state=None)
# free the memory held by the full frame
del df_orginal
gc.collect()

2.2 Missing Values

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4658205 entries, 13077081 to 21758719
Data columns (total 6 columns):
user_id int64
item_id int64
behavior_type int64
user_geohash object
item_category int64
time object
dtypes: int64(4), object(2)
memory usage: 248.8+ MB

df.shape

(4658205, 6)

# count missing values per column
df.isnull().sum()

user_id 0
item_id 0
behavior_type 0
user_geohash 3183516
item_category 0
time 0
dtype: int64

  • Only user_geohash has missing values, and its missing rate is very high (about 68%), so the column carries no analytical value and is dropped
df.drop('user_geohash',axis=1,inplace=True)

2.3 Dates and Time-of-Day Buckets

# split the time field into a date part and an hour part
df['date'] = df['time'].str[0:10]
df['date'] = pd.to_datetime(df['date'],format='%Y-%m-%d')
df['time'] = df['time'].str[11:]
df['time'] = df['time'].astype(int)
# bucket the hour into '凌晨' (0-5, early morning), '上午' (6-10, morning), '中午' (11-13, midday), '下午' (14-18, afternoon), '晚上' (19-23, evening)
df['hour'] = pd.cut(df['time'],bins=[-1,5,10,13,18,24],labels=['凌晨','上午','中午','下午','晚上'])
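A quick illustration of how the binning behaves (a minimal sketch with made-up sample hours): pd.cut assigns each hour to the half-open interval containing it, so bins=[-1,5,10,13,18,24] yields (-1,5], (5,10], (10,13], (13,18] and (18,24].

# hypothetical hours, purely to show the bucket boundaries
sample_hours = pd.Series([3, 9, 12, 15, 22])
print(pd.cut(sample_hours, bins=[-1,5,10,13,18,24],
             labels=['凌晨','上午','中午','下午','晚上']))
# -> 凌晨, 上午, 中午, 下午, 晚上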

2.4 Building the User Tag Table

# create the user tag table; every tag built below is merged into it
users = df['user_id'].unique()
labels = pd.DataFrame(users,columns=['user_id'])

3. User Behavior Tags

3.1 Most Active Browsing Time of Day

# group by user and time bucket and count browse events (behavior_type 1 = browse)
time_browse = df[df['behavior_type']==1].groupby(['user_id','hour']).item_id.count().reset_index()

time_browse.rename(columns={'item_id':'hour_counts'},inplace=True)

# find each user's maximum browse count across time buckets
time_browse_max = time_browse.groupby('user_id').hour_counts.max().reset_index()
time_browse_max.rename(columns={'hour_counts':'read_counts_max'},inplace=True)
time_browse = pd.merge(time_browse,time_browse_max,how='left',on='user_id')
# keep each user's most-browsed bucket(s); ties are joined with commas
time_browse_hour = time_browse.loc[time_browse['hour_counts']==time_browse['read_counts_max'],'hour'].groupby(time_browse['user_id']).aggregate(lambda x:','.join(x)).reset_index()
time_browse_hour.head()
   user_id  hour
0      492  晚上
1     3726  晚上
2    19137  晚上
3    36465  下午
4    37101  下午
# merge the active browsing time into the tag table
labels = pd.merge(labels,time_browse_hour,how='left',on='user_id')
labels.rename(columns={'hour':'time_browse'},inplace=True)
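The per-user "maximum with ties" pattern above (max, merge back, filter) can also be written with transform, which skips the intermediate merge; a sketch that should produce the same result as time_browse_hour (time_browse_hour_alt is an illustrative name):

# mark the rows where hour_counts equals that user's maximum...
is_max = time_browse['hour_counts'] == time_browse.groupby('user_id')['hour_counts'].transform('max')
# ...then join the tied buckets per user, as before
time_browse_hour_alt = (time_browse[is_max].groupby('user_id')['hour']
                        .agg(lambda x: ','.join(x)).reset_index())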

3.2 Most Active Buying Time of Day

# same logic as the browsing time bucket, for purchases (behavior_type 4 = buy)
time_buy = df[df['behavior_type']==4].groupby(['user_id','hour']).item_id.count().reset_index()
time_buy.rename(columns={'item_id':'hour_counts'},inplace=True)
time_buy_max = time_buy.groupby('user_id').hour_counts.max().reset_index()
time_buy_max.rename(columns={'hour_counts':'buy_counts_max'},inplace=True)
time_buy = pd.merge(time_buy,time_buy_max,how='left',on='user_id')
time_buy_hour = time_buy.loc[time_buy['hour_counts']==time_buy['buy_counts_max'],'hour'].groupby(time_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()
time_buy_hour.head()
   user_id  hour
0    38745  中午
1    45561  上午,中午
2    53394  晚上
3    59436  晚上
4   100605  凌晨
# merge the active buying time into the tag table
labels = pd.merge(labels,time_buy_hour,how='left',on='user_id')
labels.rename(columns={'hour':'time_buy'},inplace=True)
del time_browse
del time_buy
del time_browse_hour
del time_browse_max
del time_buy_hour
del time_buy_max
gc.collect()

168

3.3 Category-Level Behavior

df_browse = df.loc[df['behavior_type']==1,['user_id','item_id','item_category']]
df_collect = df.loc[df['behavior_type']==2,['user_id','item_id','item_category']]
df_cart = df.loc[df['behavior_type']==3,['user_id','item_id','item_category']]
df_buy = df.loc[df['behavior_type']==4,['user_id','item_id','item_category']]

3.3.1 Most-Browsed Categories

# group by user and category and count browse events
df_cate_most_browse = df_browse.groupby(['user_id','item_category']).item_id.count().reset_index()

df_cate_most_browse.rename(columns={'item_id':'item_category_counts'},inplace=True)

# find each user's maximum browse count across categories
df_cate_most_browse_max = df_cate_most_browse.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_browse_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_browse = pd.merge(df_cate_most_browse,df_cate_most_browse_max,how='left',on='user_id')
df_cate_most_browse['item_category'] = df_cate_most_browse['item_category'].astype(str)
# keep each user's most-browsed category(ies); ties are joined with commas
df_cate_browse = df_cate_most_browse.loc[df_cate_most_browse['item_category_counts']==df_cate_most_browse['item_category_counts_max'],'item_category'].groupby(df_cate_most_browse['user_id']).aggregate(lambda x:','.join(x)).reset_index()
df_cate_browse.head()
   user_id  item_category
0      492  6344
1     3726  5027
2    19137  3695,3942
3    36465  12997
4    37101  1863
# merge the most-browsed categories into the tag table
labels = pd.merge(labels,df_cate_browse,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_browse'},inplace=True)

3.3.2 Most-Collected Categories

# same logic as the most-browsed categories (behavior_type 2 = collect)
df_cate_most_collect = df_collect.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_collect.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_collect_max = df_cate_most_collect.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_collect_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_collect = pd.merge(df_cate_most_collect,df_cate_most_collect_max,how='left',on='user_id')
df_cate_most_collect['item_category'] = df_cate_most_collect['item_category'].astype(str)
df_cate_collect = df_cate_most_collect.loc[df_cate_most_collect['item_category_counts']==df_cate_most_collect['item_category_counts_max'],'item_category'].groupby(df_cate_most_collect['user_id']).aggregate(lambda x:','.join(x)).reset_index()
df_cate_collect.head()
   user_id  item_category
0    36465  12997
1    38745  10523
2    45561  3783
3    59436  11159
4    60723  354,2939,6900,8270,8665,10242,11304,11991
labels = pd.merge(labels,df_cate_collect,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_collect'},inplace=True)

3.3.3 Top Add-to-Cart Categories

# same logic as the most-browsed categories (behavior_type 3 = add to cart)
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_cart_max = df_cate_most_cart.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_cart_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_cart = pd.merge(df_cate_most_cart,df_cate_most_cart_max,how='left',on='user_id')
df_cate_most_cart['item_category'] = df_cate_most_cart['item_category'].astype(str)
df_cate_cart = df_cate_most_cart.loc[df_cate_most_cart['item_category_counts']==df_cate_most_cart['item_category_counts_max'],'item_category'].groupby(df_cate_most_cart['user_id']).aggregate(lambda x:','.join(x)).reset_index()
df_cate_cart.head()
   user_id  item_category
0     3726  6000
1    37101  6344
2    45561  1863,6648
3    59436  2754
4    61797  13230
labels = pd.merge(labels,df_cate_cart,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_cart'},inplace=True)

3.3.4 Most-Purchased Categories

# same logic as the most-browsed categories
df_cate_most_buy = df_buy.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_buy.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_buy_max = df_cate_most_buy.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_buy_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_buy = pd.merge(df_cate_most_buy,df_cate_most_buy_max,how='left',on='user_id')
df_cate_most_buy['item_category'] = df_cate_most_buy['item_category'].astype(str)
df_cate_buy = df_cate_most_buy.loc[df_cate_most_buy['item_category_counts']==df_cate_most_buy['item_category_counts_max'],'item_category'].groupby(df_cate_most_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()
df_cate_buy.head()
   user_id  item_category
0    38745  10556
1    45561  6717,10559
2    53394  13500
3    59436  4370
4   100605  930,3783,11455
labels = pd.merge(labels,df_cate_buy,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_buy'},inplace=True)
del df_browse
del df_collect
del df_cart
del df_buy
del df_cate_most_browse
del df_cate_most_collect
del df_cate_most_buy
del df_cate_most_cart
del df_cate_most_browse_max
del df_cate_most_collect_max
del df_cate_most_cart_max
del df_cate_most_buy_max
del df_cate_browse
del df_cate_collect
del df_cate_cart
del df_cate_buy
gc.collect()

112

3.4 Behavior over 30 Days

  • The dataset spans exactly one month, so the 30-day window is simply the entire dataset

3.4.1 Purchases in the Last 30 Days

# group purchase events by user and count them
df_counts_30_buy = df[df['behavior_type']==4].groupby('user_id').item_id.count().reset_index()
df_counts_30_buy.head()
   user_id  item_id
0    38745        2
1    45561        2
2    53394        1
3    59436        4
4   100605        3
labels = pd.merge(labels,df_counts_30_buy,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_30_buy'},inplace=True)

3.4.2 Add-to-Cart Events in the Last 30 Days

# group add-to-cart events by user and count them
df_counts_30_cart = df[df['behavior_type']==3].groupby('user_id').item_id.count().reset_index()
df_counts_30_cart.head()
   user_id  item_id
0     3726        1
1    37101        1
2    45561       22
3    59436        9
4    61797        5
labels = pd.merge(labels,df_counts_30_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_30_cart'},inplace=True)

3.4.3 Active Days in the Last 30 Days

# per user, count distinct active dates across all behaviors (browse, collect, cart, buy)
counts_30_active = df.groupby('user_id')['date'].nunique()
counts_30_active.head()

user_id
492 11
3726 15
19137 6
36465 7
37101 19
Name: date, dtype: int64

labels = pd.merge(labels,counts_30_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_30_active'},inplace=True)
del df_counts_30_buy
del df_counts_30_cart
del counts_30_active
gc.collect()

238

3.5 Behavior over 7 Days

# the dataset ends on 2014-12-18, so "the last 7 days" means behavior after 2014-12-11
df_near_7 = df[df['date']>datetime.strptime('2014-12-11', '%Y-%m-%d')]

3.5.1 Purchases in the Last 7 Days

df_counts_7_buy = df_near_7[df_near_7['behavior_type']==4].groupby('user_id').item_id.count().reset_index()
df_counts_7_buy.head()
   user_id  item_id
0    45561        2
1    59436        1
2   100605        2
3   100890        1
4   131694        2
labels = pd.merge(labels,df_counts_7_buy,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_buy'},inplace=True)

3.5.2 Add-to-Cart Events in the Last 7 Days

df_counts_7_cart = df_near_7[df_near_7['behavior_type']==3].groupby('user_id').item_id.count().reset_index()
df_counts_7_cart.head()
   user_id  item_id
0     3726        1
1    45561        9
2    59436        7
3   100605        2
4   131694        3
labels = pd.merge(labels,df_counts_7_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_cart'},inplace=True)

3.5.3 Active Days in the Last 7 Days

counts_7_active = df_near_7.groupby('user_id')['date'].nunique()
counts_7_active.head()

user_id
492 4
3726 5
19137 1
36465 2
37101 5
Name: date, dtype: int64

labels = pd.merge(labels,counts_7_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_7_active'},inplace=True)
del df_counts_7_buy
del df_counts_7_cart
del counts_7_active
gc.collect()

112

3.6 Days Since the Last Action (relative to 2014-12-19, the day after the dataset ends)

3.6.1 Days Since the Last Browse

days_browse = df[df['behavior_type']==1].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2014-12-19','%Y-%m-%d')-x).days)
days_browse.head()

user_id
492 1
3726 1
19137 7
36465 3
37101 3
Name: date, dtype: int64

labels = pd.merge(labels,days_browse,how='left',on='user_id')

labels.rename(columns={'date':'days_browse'},inplace=True)
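Because 'date' is a datetime64 column, the row-wise apply can be replaced by vectorized timestamp arithmetic; a sketch of the same recency computation (days_browse_alt is an illustrative name):

# vectorized: subtract each user's last browse date from the reference day
last_browse = df[df['behavior_type']==1].groupby('user_id')['date'].max()
days_browse_alt = (pd.Timestamp('2014-12-19') - last_browse).dt.days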

3.6.2 Days Since the Last Add-to-Cart

days_cart = df[df['behavior_type']==3].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2014-12-19','%Y-%m-%d')-x).days)
days_cart.head()

user_id
3726 1
37101 8
45561 1
59436 4
61797 18
Name: date, dtype: int64

labels = pd.merge(labels,days_cart,how='left',on='user_id')
labels.rename(columns={'date':'days_cart'},inplace=True)

3.6.3 Days Since the Last Purchase

days_buy = df[df['behavior_type']==4].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2014-12-19','%Y-%m-%d')-x).days)
days_buy.head()

user_id
38745 23
45561 6
53394 22
59436 7
100605 7
Name: date, dtype: int64

labels = pd.merge(labels,days_buy,how='left',on='user_id')
labels.rename(columns={'date':'days_buy'},inplace=True)

del days_browse
del days_buy
del days_cart
gc.collect()

42

3.7 Days Between the Two Most Recent Purchases

# one row per (user, purchase date)
df_interval_buy = df[df['behavior_type']==4].groupby(['user_id','date']).item_id.count().reset_index()

# sort each user's purchase dates, diff consecutive dates, and keep the gap between the two most recent purchases (tail(1), not head(1), gives the latest gap)
interval_buy = df_interval_buy.groupby('user_id')['date'].apply(lambda x:x.sort_values().diff(1).dropna().tail(1)).reset_index()

# convert the timedelta to whole days
interval_buy['date'] = interval_buy['date'].apply(lambda x : x.days)

# drop the helper index level created by the groupby-apply
interval_buy.drop('level_1',axis=1,inplace=True)

interval_buy.rename(columns={'date':'interval_buy'},inplace=True)
interval_buy.head()
   user_id  interval_buy
0    59436             2
1   100605             3
2   106362             2
3   131694             3
4   137907             9
labels = pd.merge(labels,interval_buy,how='left',on='user_id')

del df_interval_buy
gc.collect()

70

3.8 Browsed but Never Bought

# keep only browse (1) and buy (4) events
df_browse_buy = df.loc[(df['behavior_type']==1) | (df['behavior_type']==4),['user_id','item_id','behavior_type','time']]

# cross-tab: for each (user, item), count browse and buy events
browse_not_buy = pd.pivot_table(df_browse_buy,index=['user_id','item_id'],columns=['behavior_type'],values=['time'],aggfunc=['count'])

# flatten the MultiIndex columns; the behavior types sort as 1 then 4, i.e. browse then buy
browse_not_buy.columns = ['browse','buy']

browse_not_buy.fillna(0,inplace=True)

# flag (user, item) pairs that were browsed but never bought
browse_not_buy['browse_not_buy'] = 0

browse_not_buy.loc[(browse_not_buy['browse']>0) & (browse_not_buy['buy']==0),'browse_not_buy'] = 1

# per user: how many items were browsed but never bought
browse_not_buy = browse_not_buy.groupby('user_id')['browse_not_buy'].sum().reset_index()

browse_not_buy.head()
   user_id  browse_not_buy
0      492              34
1     3726              68
2    19137              10
3    36465              12
4    37101             118
labels = pd.merge(labels,browse_not_buy,how='left',on='user_id')

# '是' (yes) if the user has at least one browsed-but-never-bought item, otherwise '否' (no)
labels['browse_not_buy'] = labels['browse_not_buy'].apply(lambda x: '是' if x>0 else '否')
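The same browse/buy cross-tab can be built without pivot_table, using size().unstack(); a sketch assuming, as above, that behavior type 1 is browse and 4 is buy (counts and browse_not_buy_alt are illustrative names):

# count rows per (user, item, behavior), then spread behavior types into columns
counts = (df_browse_buy.groupby(['user_id','item_id','behavior_type'])
          .size().unstack(fill_value=0)
          .rename(columns={1:'browse', 4:'buy'}))
# per user: number of items browsed but never bought
browse_not_buy_alt = ((counts['browse']>0) & (counts['buy']==0)).groupby('user_id').sum()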

3.9 Added to Cart but Never Bought

# same logic as 3.8, with add-to-cart (behavior_type 3) in place of browse
df_cart_buy = df.loc[(df['behavior_type']==3) | (df['behavior_type']==4),['user_id','item_id','behavior_type','time']]
cart_not_buy = pd.pivot_table(df_cart_buy,index=['user_id','item_id'],columns=['behavior_type'],values=['time'],aggfunc=['count'])
cart_not_buy.columns = ['cart','buy']
cart_not_buy.fillna(0,inplace=True)
cart_not_buy['cart_not_buy'] = 0
cart_not_buy.loc[(cart_not_buy['cart']>0) & (cart_not_buy['buy']==0),'cart_not_buy'] = 1
cart_not_buy = cart_not_buy.groupby('user_id')['cart_not_buy'].sum().reset_index()
cart_not_buy.head()
   user_id  cart_not_buy
0     3726             1
1    37101             1
2    38745             0
3    45561            22
4    53394             0
labels = pd.merge(labels,cart_not_buy,how='left',on='user_id')
labels['cart_not_buy'] = labels['cart_not_buy'].apply(lambda x: '是' if x>0 else '否')

4. User Attribute Tags

4.1 Repeat Buyers

# count each user's purchase events; more than one purchase counts as a repurchase
buy_again = df[df['behavior_type']==4].groupby('user_id')['item_id'].count().reset_index()

buy_again.rename(columns={'item_id':'buy_again'},inplace=True)

buy_again.head()
   user_id  buy_again
0    38745          2
1    45561          2
2    53394          1
3    59436          4
4   100605          3
labels = pd.merge(labels,buy_again,how='left',on='user_id')

labels['buy_again'].fillna(-1,inplace=True)
# never bought -> '未购买'; bought exactly once -> '否' (no repurchase); bought more than once -> '是' (repurchase)
labels['buy_again'] = labels['buy_again'].apply(lambda x: '是' if x>1 else '否' if x==1 else '未购买')
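The nested conditional expression works, but np.select reads more clearly as the number of cases grows; a sketch equivalent to the apply above (it must run on the numeric buy_again column, i.e. before the apply; buy_again_alt is an illustrative name):

# repeat buyer / single purchase / never bought, vectorized
conds = [labels['buy_again'] > 1, labels['buy_again'] == 1]
labels['buy_again_alt'] = np.select(conds, ['是', '否'], default='未购买')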

4.2 Visit Activity Level

user_active_level = labels['counts_30_active'].value_counts().sort_index(ascending=False)

plt.figure(figsize=(16,9))
user_active_level.plot(title='30天内访问次数与访问人数的关系',fontsize=18)
plt.ylabel('访问人数',fontsize=14)
plt.xlabel('访问次数',fontsize=14)

Text(0.5, 0, '访问次数')

[Figure: line chart of the number of users at each 30-day visit count]
Analysis: the curve bends at around 14, so 14 or fewer is taken as low activity and more than 14 as high activity. This cutoff comes purely from the shape of the user distribution; in real work, "active" should be defined from the business side. The visit label uses counts_30_active; the purchase-activity label buy_active_level, which the RFM grouping in 4.4 and the final tag table rely on, is built the same way from counts_30_buy:

labels['user_active_level'] = '高'    # '高' = high, '低' = low
labels.loc[labels['counts_30_active']<=14,'user_active_level'] = '低'
labels['buy_active_level'] = '高'
labels.loc[labels['counts_30_buy']<=14,'buy_active_level'] = '低'

4.3 Single-Category Buyers

buy_single = df[df['behavior_type']==4].groupby('user_id').item_category.nunique().reset_index()

buy_single.rename(columns={'item_category':'buy_single'},inplace=True)

labels = pd.merge(labels,buy_single,how='left',on='user_id')
labels['buy_single'].fillna(-1,inplace=True)
# '是' (yes) if the user bought exactly one category, '否' (no) if more than one, '未购买' if they never bought
labels['buy_single'] = labels['buy_single'].apply(lambda x: '否' if x>1 else '是' if x==1 else '未购买')

4.4 User Value Segments (RFM)

last_buy_days = labels['days_buy'].value_counts().sort_index()

plt.figure(figsize=(16,9))
last_buy_days.plot(title='最后一次购买距今天数与购买人数的关系',fontsize=18)
plt.ylabel('购买人数',fontsize=14)
plt.xlabel('距今天数',fontsize=14)

Text(0.5, 0, '距今天数')

[Figure: line chart of the number of buyers at each days-since-last-purchase value]

Note: the anomalous day is December 12, the Double 12 shopping festival.

# recency: last purchase within 8 days -> '高' (recent), otherwise '低'
labels['buy_days_level'] = '高'
labels.loc[labels['days_buy']>8,'buy_days_level'] = '低'
# concatenate purchase frequency and recency into a two-character key such as '高高'
labels['rfm_value'] = labels['buy_active_level'].str.cat(labels['buy_days_level'])
def trans_value(x):
    if x == '高高':
        return '重要价值客户'    # frequent and recent: key value customers
    elif x == '低高':
        return '重要深耕客户'    # recent but infrequent: customers to cultivate
    elif x == '高低':
        return '重要唤回客户'    # frequent but lapsed: customers to win back
    else:
        return '即将流失客户'    # neither: customers about to churn
labels['rfm'] = labels['rfm_value'].apply(trans_value)
labels.drop(['buy_days_level','rfm_value'],axis=1,inplace=True)
labels['rfm'].value_counts()

重要深耕客户 7167
重要价值客户 7142
即将流失客户 5631
重要唤回客户 16
Name: rfm, dtype: int64

[Preview of the assembled labels table, one row per user, with columns: user_id, time_browse, time_buy, cate_most_browse, cate_most_collect, cate_most_cart, cate_most_buy, counts_30_buy, counts_30_cart, counts_30_active, days_cart, days_buy, interval_buy, browse_not_buy, cart_not_buy, buy_again, user_active_level, buy_active_level, buy_single, rfm]

What follows is the more detailed persona work: individual user tags, user preference tags, and group-level preference tags, built with the TF-IDF algorithm and cosine similarity.

  • First, preprocess the data again; this part works on the full dataset, so reload it (df_orginal was deleted earlier to free memory)
# reload the raw data
df_orginal = pd.read_csv('/home/kesci/input/mydata9388/taobao_persona.csv')
# keep only the date part of 'time'
df_orginal['time'] = df_orginal['time'].str[0:10]
# fill 'user_geohash' so it can serve as the count column in the groupby below
df_orginal['user_geohash'].fillna('1',inplace=True)
# collapse to one row per (user, item, behavior, category, date), counting events
df = df_orginal.groupby(['user_id','item_id','behavior_type','item_category','time'])['user_geohash'].count().reset_index()
df.rename(columns={'user_geohash':'behavior_count'},inplace=True)
df.head()
   user_id  item_id  behavior_type  item_category        time  behavior_count
0      492   254885              1           6344  2014-12-07               9
1      492   254885              3           6344  2014-12-07               1
2      492   254885              4           6344  2014-12-07               1
3      492  2316002              1           6247  2014-12-09               3
4      492  3473697              1           2413  2014-12-12               2
# free memory
del df_orginal
gc.collect()
df['time'] = pd.to_datetime(df['time'])

5. Individual User Tags

5.1 Computing Tag Weights with TF-IDF

# TF numerator: how many times each tag (item) occurs on each user
df_tag_weight_tfidf_01_01 = df.groupby(['user_id','item_id'])['time'].count().reset_index()
df_tag_weight_tfidf_01_01.rename(columns={'time':'weight_m_p'},inplace=True)

# TF denominator: each user's total tag count
df_tag_weight_tfidf_01_02 = df.groupby(['user_id'])['time'].count().reset_index()
df_tag_weight_tfidf_01_02.rename(columns={'time':'weight_m_s'},inplace=True)
df_tag_weight_tfidf_01 = pd.merge(df_tag_weight_tfidf_01_01,df_tag_weight_tfidf_01_02,how='left',on='user_id')
# IDF denominator: each tag's total occurrences across all users
df_tag_weight_tfidf_02 = df_tag_weight_tfidf_01.groupby(['item_id']).weight_m_p.sum().reset_index()
df_tag_weight_tfidf_02.rename(columns={'weight_m_p':'weight_w_p'},inplace=True)
# IDF numerator: the grand total across all tags
df_tag_weight_tfidf_02['weight_w_s'] = df_tag_weight_tfidf_01['weight_m_p'].sum()
df_tag_weight_tfidf_03 = pd.merge(df_tag_weight_tfidf_01,df_tag_weight_tfidf_02,how='left',on='item_id')

# TF-IDF: (user's count for the tag / user's total) * log10(grand total / tag's total)
df_tag_weight_tfidf_03['tfidf_ratio'] = (df_tag_weight_tfidf_03['weight_m_p']/df_tag_weight_tfidf_03['weight_m_s'])*(np.log10(df_tag_weight_tfidf_03['weight_w_s']/df_tag_weight_tfidf_03['weight_w_p']))
df = pd.merge(df,df_tag_weight_tfidf_03[['user_id','item_id','tfidf_ratio']],how='left',on=['user_id','item_id']).reset_index(drop=True)
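A quick numeric check of the formula with toy numbers (not taken from the dataset): a tag making up 5 of a user's 50 actions has TF 0.1, and a tag seen 100 times out of 1,000,000 total has IDF log10(10000) = 4, giving weight 0.4.

# toy sanity check of the TF-IDF expression above
weight_m_p, weight_m_s = 5, 50             # this user's count for the tag / the user's total
weight_w_p, weight_w_s = 100, 1_000_000    # the tag's global count / the grand total
print((weight_m_p/weight_m_s) * np.log10(weight_w_s/weight_w_p))   # 0.4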

del df_tag_weight_tfidf_01_01
del df_tag_weight_tfidf_01_02
del df_tag_weight_tfidf_01
del df_tag_weight_tfidf_02
del df_tag_weight_tfidf_03
gc.collect()

101

5.2 Behavior-Type Weight Table

browse: weight 0.3
collect: weight 0.5
add to cart: weight 1
buy: weight 1.5

df['act_weight_plan'] = 0.3
df.loc[df['behavior_type']==2,'act_weight_plan']=0.5
df.loc[df['behavior_type']==3,'act_weight_plan']=1
df.loc[df['behavior_type']==4,'act_weight_plan']=1.5
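The same mapping can be written in a single line with Series.map; a sketch equivalent to the .loc assignments above:

# behavior_type -> planned weight: 1 browse, 2 collect, 3 cart, 4 buy
df['act_weight_plan'] = df['behavior_type'].map({1: 0.3, 2: 0.5, 3: 1.0, 4: 1.5})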

5.3 Computing the Final Tag Weights

# time-decay function for tag weights
# in this project the add-to-cart weight does not decay over time, while browse, collect and buy say less and less about the present as they age, so their weights shrink as time passes
def weight_time_reduce(act_date):
    # days between the action date and the reference day (the day after the dataset ends)
    date_interval = datetime.strptime('2014-12-19', '%Y-%m-%d') - act_date
    date_interval = date_interval.days
    # exponential decay
    time_reduce_ratio = np.exp(date_interval*(-0.1556))
    return time_reduce_ratio
df['time_reduce_ratio'] = 1
# apply the decay to everything except add-to-cart (behavior_type 3)
df.loc[df['behavior_type']!=3,'time_reduce_ratio'] = df.loc[df['behavior_type']!=3,'time'].apply(lambda x:weight_time_reduce(x))

# total tag weight = behavior-type weight * decay ratio * behavior count * TF-IDF tag weight
df['act_weight'] = df['act_weight_plan']*df['time_reduce_ratio']*df['behavior_count']*df['tfidf_ratio']
df.head(5)
   user_id  item_id  behavior_type  item_category        time  behavior_count  tfidf_ratio  act_weight_plan  time_reduce_ratio  act_weight
0      492   254885              1           6344  2014-12-07               9     0.149319              0.3           0.154556    0.062311
1      492   254885              3           6344  2014-12-07               1     0.149319              1.0           1.000000    0.149319
2      492   254885              4           6344  2014-12-07               1     0.149319              1.5           0.154556    0.034617
3      492  2316002              1           6247  2014-12-09               3     0.062962              0.3           0.210978    0.011955
4      492  3473697              1           2413  2014-12-12               2     0.060274              0.3           0.336486    0.012169
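The time_reduce_ratio values in the table drop straight out of the decay function: 2014-12-12 and 2014-12-07 are 7 and 12 days before 2014-12-19, so:

np.exp(-0.1556 * 7)     # ≈ 0.336486, the ratio on the 2014-12-12 row
np.exp(-0.1556 * 12)    # ≈ 0.154556, the ratio on the 2014-12-07 rows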

6. User Preference Tags

# computing pairwise similarity over all tags is too heavy for the server, so only tags with purchase behavior are kept
user_tag_public = df[df['behavior_type']==4]
user_tag_01 = user_tag_public[['user_id','item_id']]

6.1 Co-occurrence Counts for Tag Pairs

# self-join on user_id: within each user, every pair of their tags
user_tag_02 = pd.merge(user_tag_01,user_tag_01,on='user_id')

# drop the pairs where both sides are the same tag
user_tag_03 = user_tag_02.drop(labels=user_tag_02[user_tag_02['item_id_x']==user_tag_02['item_id_y']].index,axis=0)
# group by the tag pair and count rows: how often the two tags co-occur (a user with repeated rows counts more than once)
user_tag = user_tag_03.groupby(['item_id_x','item_id_y'])['user_id'].count().reset_index()
user_tag.rename(columns={'user_id':'counts_common'},inplace=True)

6.2 User Counts per Tag

# distinct users per tag
user_tag_05 = user_tag_01.groupby(['item_id'])['user_id'].nunique().reset_index()
user_tag_05.rename(columns={'user_id':'counts_item_user'},inplace=True)
# attach the user count for the first tag in each pair
user_tag = pd.merge(user_tag,user_tag_05,how='left',left_on='item_id_x',right_on='item_id').drop('item_id_x',axis=1)
user_tag.rename(columns={'counts_item_user':'counts_item_x','item_id':'item_id_x'},inplace=True)
# attach the user count for the second tag in each pair
user_tag = pd.merge(user_tag,user_tag_05,how='left',left_on='item_id_y',right_on='item_id').drop('item_id_y',axis=1)
user_tag.rename(columns={'counts_item_user':'counts_item_y','item_id':'item_id_y'},inplace=True)

6.3 Similarity Between Tag Pairs

# cosine-style similarity between the two tags of each pair
user_tag['power'] = user_tag['counts_common'] / np.sqrt(user_tag['counts_item_x']*user_tag['counts_item_y'])
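As a check against the output shown in 6.5 (values taken from that table): two tags that co-occur 19 times, touching 2 and 1 distinct users respectively, score 19/√(2×1) ≈ 13.435. Note that counts_common counts joined row pairs (a user with several purchase rows for a tag contributes more than once) while the denominators count distinct users, so the score is cosine-like rather than a true cosine and can exceed 1, as the 124.45 rows show.

# sanity check of the similarity score, using values from the 6.5 output
print(19 / np.sqrt(2 * 1))    # ≈ 13.435029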

6.4 Summing Each User's Historical Tag Weights

# sum act_weight per (user, tag)
user_tag_06 = user_tag_public.groupby(['user_id','item_id'])['act_weight'].sum().reset_index()

6.5 Computing Tags to Recommend to Each User

# join each of a user's tags to all tags related to them
user_persona_tag = pd.merge(user_tag_06,user_tag,how='left',left_on='item_id',right_on='item_id_x').drop('item_id',axis=1)

del user_tag_01
del user_tag_02
del user_tag_03
del user_tag_05
del user_tag_06
gc.collect()

77

# recommendation score = tag weight * similarity
user_persona_tag['recommend'] = user_persona_tag['act_weight']*user_persona_tag['power']

# sort by score, then keep each user's 10 highest-scoring related tags
user_persona_tag_total = user_persona_tag.sort_values('recommend', ascending=False).groupby(['user_id']).head(10)
user_persona_tag_total.head(10)
          user_id  act_weight  counts_common    item_id_x  counts_item_x    item_id_y  counts_item_y       power   recommend
941565   16680502    6.132241          176.0  354309086.0            1.0  337331963.0            2.0  124.450793  763.162218
941547   16680502    2.591208          176.0  337331963.0            2.0  354309086.0            1.0  124.450793  322.477880
941558   16680502    6.132241           22.0  354309086.0            1.0  177149139.0            1.0   22.000000  134.909295
4517695  61545656    1.393036           88.0  131284675.0            1.0  250218619.0            1.0   88.000000  122.587185
4517697  61545656    1.393036           77.0  131284675.0            1.0  270996967.0            1.0   77.000000  107.263787
4517934  61545656    0.890714           88.0  250218619.0            1.0  131284675.0            1.0   88.000000   78.382824
4517685  61545656    1.393036           55.0  131284675.0            1.0   36667020.0            1.0   55.000000   76.616990
529309   11279964    5.639947           19.0  168681351.0            2.0  395358462.0            1.0   13.435029   75.772847
529307   11279964    5.639947           19.0  168681351.0            2.0  379008813.0            1.0   13.435029   75.772847
529306   11279964    5.639947           19.0  168681351.0            2.0  375010017.0            1.0   13.435029   75.772847
del user_persona_tag
del user_persona_tag_total
gc.collect()

7

7. Group-Level Persona Tags

Group-level tags require segmenting users first. To keep the complexity down while still demonstrating group-level persona tags, each user is assigned a random sex for now; given more time, the user segments derived above could be profiled the same way.

7.1 Randomly Assign a Sex and Split into Groups

user = pd.DataFrame(df['user_id'].unique(),columns=['user_id'])

# one random 0/1 per user (the dataset has 20,000 users); '男' = male, '女' = female
user['sex'] = np.random.randint(0,2,len(user))
user.loc[user['sex']==1,'sex'] = '男'
user.loc[user['sex']==0,'sex'] = '女'
df_group = pd.merge(df[['user_id','item_id','act_weight']],user,how='left',on='user_id')

del df
gc.collect()

36

df_group.head(5)
   user_id  item_id  act_weight  sex
0      492   254885    0.062311    …
1      492   254885    0.149319    …
2      492   254885    0.034617    …
3      492  2316002    0.011955    …
4      492  3473697    0.012169    …

7.2 Group Tag Preferences via TF-IDF

# summed tag weight per (sex, tag)
df_group_weight_tfidf_01_01 = df_group.groupby(['sex','item_id'])['act_weight'].sum().reset_index()

df_group_weight_tfidf_01_01.head(5)
  sex  item_id  act_weight
0  女       64    0.008926
1  女      270    0.000093
2  女      391    0.000333
3  女      668    0.000369
4  女      869    0.000324
df_group_weight_tfidf_01_01.rename(columns={'act_weight':'weight_m_p'},inplace=True)
# total tag weight per sex
df_group_weight_tfidf_01_02 = df_group.groupby(['sex'])['act_weight'].sum().reset_index()
df_group_weight_tfidf_01_02.rename(columns={'act_weight':'weight_m_s'},inplace=True)
df_group_weight_tfidf_01 = pd.merge(df_group_weight_tfidf_01_01,df_group_weight_tfidf_01_02,how='left',on='sex')
df_group_weight_tfidf_01.head(5)
  sex  item_id  weight_m_p    weight_m_s
0  女       64    0.008926  26497.333993
1  女      270    0.000093  26497.333993
2  女      391    0.000333  26497.333993
3  女      668    0.000369  26497.333993
4  女      869    0.000324  26497.333993
# total weight per tag, summed over both sexes
df_group_weight_tfidf_02 = df_group_weight_tfidf_01.groupby(['item_id'])['weight_m_p'].sum().reset_index()
df_group_weight_tfidf_02.rename(columns={'weight_m_p':'weight_w_p'},inplace=True)
# grand total weight over all tags
df_group_weight_tfidf_02['weight_w_s'] = df_group_weight_tfidf_01['weight_m_p'].sum()
df_group_weight_tfidf_02.head(5)
   item_id  weight_w_p    weight_w_s
0       37    0.006981  53319.602753
1       64    0.009180  53319.602753
2      177    0.000028  53319.602753
3      270    0.000093  53319.602753
4      368    0.000358  53319.602753
df_group_weight_tfidf_03 = pd.merge(df_group_weight_tfidf_01,df_group_weight_tfidf_02,how='left',on='item_id')
# TF-IDF-style preference per (sex, tag): (the sex's weight for the tag / the sex's total) * (grand total / the tag's total); unlike 5.1, no log is applied here
df_group_weight_tfidf_03['tfidf_ratio'] = (df_group_weight_tfidf_03['weight_m_p']/df_group_weight_tfidf_03['weight_m_s'])*(df_group_weight_tfidf_03['weight_w_s']/df_group_weight_tfidf_03['weight_w_p'])

del df_group
del df_group_weight_tfidf_01_01
del df_group_weight_tfidf_01_02
del df_group_weight_tfidf_01
del df_group_weight_tfidf_02
gc.collect()

34

# sort by the preference value, then keep each sex's top 10 tags
df_group_weight = df_group_weight_tfidf_03.sort_values('tfidf_ratio', ascending=False).groupby(['sex']).head(10)
df_group_weight.head(15)
        sex    item_id  weight_m_p    weight_m_s  weight_w_p    weight_w_s  tfidf_ratio
1949040  女  285531425    0.001849  26497.333993    0.001849  53319.602753     2.012263
839482   女  123092722    0.013124  26497.333993    0.013124  53319.602753     2.012263
839460   女  123089014    0.000588  26497.333993    0.000588  53319.602753     2.012263
1976591  女  289570426    0.000451  26497.333993    0.000451  53319.602753     2.012263
839463   女  123089770    0.000002  26497.333993    0.000002  53319.602753     2.012263
1976588  女  289570130    0.000143  26497.333993    0.000143  53319.602753     2.012263
1976587  女  289570063    0.000196  26497.333993    0.000196  53319.602753     2.012263
1976586  女  289569845    0.000010  26497.333993    0.000010  53319.602753     2.012263
1976585  女  289569647    0.000782  26497.333993    0.000782  53319.602753     2.012263
1976581  女  289569188    0.089799  26497.333993    0.089799  53319.602753     2.012263
3521579  男  111562094    0.000046  26822.268761    0.000046  53319.602753     1.987886
5292053  男  371765933    0.000012  26822.268761    0.000012  53319.602753     1.987886
4371778  男  236398624    0.000206  26822.268761    0.000206  53319.602753     1.987886
4437179  男  246007442    0.007151  26822.268761    0.007151  53319.602753     1.987886
5174663  男  354526634    0.000209  26822.268761    0.000209  53319.602753     1.987886