电商用户画像标签
% matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import gc
import warnings
warnings. filterwarnings( 'ignore' )
from datetime import datetime
df_orginal = pd. read_csv( '/home/kesci/input/mydata9388/taobao_persona.csv' )
数据预处理
数据抽样
df = df_orginal. sample( frac= 0.2 , random_state= None )
del df_orginal
gc. collect( )
缺失值处理
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4658205 entries, 13077081 to 21758719
Data columns (total 6 columns):
user_id int64
item_id int64
behavior_type int64
user_geohash object
item_category int64
time object
dtypes: int64(4), object(2)
memory usage: 248.8+ MB
df. shape
(4658205, 6)
df. isnull( ) . sum ( )
user_id 0
item_id 0
behavior_type 0
user_geohash 3183516
item_category 0
time 0
dtype: int64
日期与时段设置
df[ 'date' ] = df[ 'time' ] . str [ 0 : 10 ]
df[ 'date' ] = pd. to_datetime( df[ 'date' ] , format = '%Y-%m-%d' )
df[ 'time' ] = df[ 'time' ] . str [ 11 : ]
df[ 'time' ] = df[ 'time' ] . astype( int )
df[ 'hour' ] = pd. cut( df[ 'time' ] , bins= [ - 1 , 5 , 10 , 13 , 18 , 24 ] , labels= [ '凌晨' , '上午' , '中午' , '下午' , '晚上' ] )
制作用户标签表
users = df[ 'user_id' ] . unique( )
labels = pd. DataFrame( users, columns= [ 'user_id' ] )
用户行为标签
用户浏览活跃时间段
time_browse = df[ df[ 'behavior_type' ] == 1 ] . groupby( [ 'user_id' , 'hour' ] ) . item_id. count( ) . reset_index( )
time_browse. rename( columns= { 'item_id' : 'hour_counts' } , inplace= True )
time_browse_max = time_browse. groupby( 'user_id' ) . hour_counts. max ( ) . reset_index( )
#更改列名
time_browse_max.rename(columns={'hour_counts':'read_counts_max'},inplace=True)
time_browse = pd. merge( time_browse, time_browse_max, how= 'left' , on= 'user_id' )
#选取各用户浏览次数最多的时段,如有并列最多的时段,用逗号连接
time_browse_hour = time_browse.loc[time_browse['hour_counts']==time_browse['read_counts_max'],'hour'].groupby(time_browse['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd. merge( labels, time_browse_hour, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'hour' : 'time_browse' } , inplace= True )
用户购买活跃时间
#生成逻辑与浏览活跃时间段相同
#先统计各个时段
time_buy = df[df['behavior_type']==4].groupby(['user_id','hour']).item_id.count().reset_index()
time_buy.rename(columns={'item_id':'hour_counts'},inplace=True)
#选择最大的时段
time_buy_max = time_buy.groupby('user_id').hour_counts.max().reset_index()
time_buy_max.rename(columns={'hour_counts':'buy_counts_max'},inplace=True)
#左连接
time_buy = pd.merge(time_buy,time_buy_max,how='left',on='user_id')
#选取各用户浏览次数最多的时段,如有并列最多的时段,用逗号连接
time_buy_hour = time_buy.loc[time_buy['hour_counts']==time_buy['buy_counts_max'],'hour'].groupby(time_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()
#将用户购买活跃时间段加入到用户标签表中
labels = pd.merge(labels,time_buy_hour,how='left',on='user_id')
labels.rename(columns={'hour':'time_buy'},inplace=True)
#del 删除变量释放内存
del time_browse
del time_buy
del time_browse_hour
del time_browse_max
del time_buy_hour
del time_buy_max
gc.collect()
关于类目的用户行为
#定义浏览、收藏、加购、购买 df.loc用来选取部分数据集,效率会比直接df[['behavior']]快
df_browse = df.loc[df['behavior_type']==1,['user_id','item_id','item_category']]
df_collect = df.loc[df['behavior_type']==2,['user_id','item_id','item_category']]
df_cart = df.loc[df['behavior_type']==3,['user_id','item_id','item_category']]
df_buy = df.loc[df['behavior_type']==4,['user_id','item_id','item_category']]
浏览最多的类目
df_cate_most_browse = df_browse. groupby( [ 'user_id' , 'item_category' ] ) . item_id. count( ) . reset_index( )
df_cate_most_browse. rename( columns= { 'item_id' : 'item_category_counts' } , inplace= True )
df_cate_most_browse_max = df_cate_most_browse. groupby( 'user_id' ) . item_category_counts. max ( ) . reset_index( )
df_cate_most_browse_max. rename( columns= { 'item_category_counts' : 'item_category_counts_max' } , inplace= True )
df_cate_most_browse[ 'item_category' ] = df_cate_most_browse[ 'item_category' ] . astype( str ) 、
df_cate_browse = df_cate_most_browse. loc[ df_cate_most_browse[ 'item_category_counts' ] == df_cate_most_browse[ 'item_category_counts_max' ] , 'item_category' ] . groupby( df_cate_most_browse[ 'user_id' ] ) . aggregate( lambda x: ',' . join( x) ) . reset_index( )
In [ 33 ] :
labels = pd. merge( labels, df_cate_browse, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'item_category' : 'cate_most_browse' } , inplace= True )
收藏最多的类目
df_cate_most_collect = df_collect. groupby( [ 'user_id' , 'item_category' ] ) . item_id. count( ) . reset_index( )
df_cate_most_collect. rename( columns= { 'item_id' : 'item_category_counts' } , inplace= True )
df_cate_most_collect_max = df_cate_most_collect. groupby( 'user_id' ) . item_category_counts. max ( ) . reset_index( )
df_cate_most_collect_max. rename( columns= { 'item_category_counts' : 'item_category_counts_max' } , inplace= True )
df_cate_most_collect = pd. merge( df_cate_most_collect, df_cate_most_collect_max, how= 'left' , on= 'user_id' )
df_cate_most_collect[ 'item_category' ] = df_cate_most_collect[ 'item_category' ] . astype( str )
df_cate_collect = df_cate_most_collect. loc[ df_cate_most_collect[ 'item_category_counts' ] == df_cate_most_collect[ 'item_category_counts_max' ] , 'item_category' ] . groupby( df_cate_most_collect[ 'user_id' ] ) . aggregate( lambda x: ',' . join( x) ) . reset_index( )
labels = pd. merge( labels, df_cate_collect, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'item_category' : 'cate_most_collect' } , inplace= True )
加购最多的类目
#生成逻辑与浏览最多的类目相同
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_cart_max = df_cate_most_cart.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_cart_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_cart = pd.merge(df_cate_most_cart,df_cate_most_cart_max,how='left',on='user_id')
df_cate_most_cart['item_category'] = df_cate_most_cart['item_category'].astype(str)
df_cate_cart = df_cate_most_cart.loc[df_cate_most_cart['item_category_counts']==df_cate_most_cart['item_category_counts_max'],'item_category'].groupby(df_cate_most_cart['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd.merge(labels,df_cate_cart,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_cart'},inplace=True)
购买最多的类目
df_cate_most_buy = df_buy. groupby( [ 'user_id' , 'item_category' ] ) . item_id. count( ) . reset_index( )
df_cate_most_buy = df_buy. groupby( [ 'user_id' , 'item_category' ] ) . item_id. count( ) . reset_index( )
df_cate_most_buy. rename( columns= { 'item_id' : 'item_category_counts' } , inplace= True )
df_cate_most_buy_max = df_cate_most_buy. groupby( 'user_id' ) . item_category_counts. max ( ) . reset_index( )
df_cate_most_buy_max. rename( columns= { 'item_category_counts' : 'item_category_counts_max' } , inplace= True )
df_cate_most_buy = pd. merge( df_cate_most_buy, df_cate_most_buy_max, how= 'left' , on= 'user_id' )
df_cate_most_buy[ 'item_category' ] = df_cate_most_buy[ 'item_category' ] . astype( str )
df_cate_buy = df_cate_most_buy. loc[ df_cate_most_buy[ 'item_category_counts' ] == df_cate_most_buy[ 'item_category_counts_max' ] , 'item_category' ] . groupby( df_cate_most_buy[ 'user_id' ] ) . aggregate( lambda x: ',' . join( x) ) . reset_index( )
labels = pd. merge( labels, df_cate_buy, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'item_category' : 'cate_most_buy' } , inplace= True )
del df_browse
del df_collect
del df_cart
del df_buy
del df_cate_most_browse
del df_cate_most_collect
del df_cate_most_buy
del df_cate_most_cart
del df_cate_most_browse_max
del df_cate_most_collect_max
del df_cate_most_cart_max
del df_cate_most_buy_max
del df_cate_browse
del df_cate_collect
del df_cate_cart
del df_cate_buy
gc. collect( 0 )
30天用户行为
近30天购买次数
df_counts_30_buy = df[ df[ 'behavior_type' ] == 4 ] . groupby( 'user_id' ) . item_id. count( ) . reset_index( )
labels = pd. merge( labels, df_counts_30_buy, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'item_id' : 'counts_30_buy' } , inplace= True )
近30天加购次数
df_counts_30_cart = df[ df[ 'behavior_type' ] == 3 ] . groupby( 'user_id' ) . item_id. count( ) . reset_index( )
labels = pd. merge( labels, df_counts_30_cart, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'item_id' : 'counts_30_cart' } , inplace= True )
近30天活跃次数
#对用户进行分组,统计活跃的天数,包括浏览、收藏、加购、购买
counts_30_active = df.groupby('user_id')['date'].nunique()
labels = pd.merge(labels,counts_30_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_30_active'},inplace=True)
del df_counts_30_buy
del df_counts_30_cart
del counts_30_active
gc.collect()
7天用户行为
df_near_7 = df[ df[ 'date' ] > datetime. strptime( '2014-12-11' , '%Y-%m-%d' ) ]
近7天购买次数
df_counts_7_buy = df_near_7[ df_near_7[ 'behavior_type' ] == 4 ] . groupby( 'user_id' ) . item_id. count( ) . reset_index( )
labels = pd. merge( labels, df_counts_7_buy, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'item_id' : 'counts_7_buy' } , inplace= True )
近7天加购次数
df_counts_7_cart = df_near_7[ df_near_7[ 'behavior_type' ] == 3 ] . groupby( 'user_id' ) . item_id. count( ) . reset_index( )
labels = pd. merge( labels, df_counts_7_cart, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'item_id' : 'counts_7_cart' } , inplace= True )
近7天活跃次数
counts_7_active = df_near_7.groupby('user_id')['date'].nunique()
labels = pd.merge(labels,counts_7_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_7_active'},inplace=True)
del df_counts_7_buy
del df_counts_7_cart
del counts_7_active
gc.collect()
最后一次行为距今天数
上次浏览距今天数
days_browse = df[ df[ 'behavior_type' ] == 1 ] . groupby( 'user_id' ) [ 'date' ] . max ( ) . apply ( lambda x: ( datetime. strptime( '2014-12-19' , '%Y-%m-%d' ) - x) . days)
labels = pd. merge( labels, days_browse, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'date' : 'days_browse' } , inplace= True )
上次加购距今天数
days_cart = df[ df[ 'behavior_type' ] == 3 ] . groupby( 'user_id' ) [ 'date' ] . max ( ) . apply ( lambda x: ( datetime. strptime( '2014-12-19' , '%Y-%m-%d' ) - x) . days)
labels = pd. merge( labels, days_cart, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'date' : 'days_cart' } , inplace= True )
上次购买距今天数
days_buy = df[ df[ 'behavior_type' ] == 4 ] . groupby( 'user_id' ) [ 'date' ] . max ( ) . apply ( lambda x: ( datetime. strptime( '2014-12-19' , '%Y-%m-%d' ) - x) . days)
labels = pd. merge( labels, days_buy, how= 'left' , on= 'user_id' )
labels. rename( columns= { 'date' : 'days_buy' } , inplace= True )
del days_browse
del days_buy
del days_cart
gc. collect( )
最后两次购买间隔天数
df_interval_buy = df[ df[ 'behavior_type' ] == 4 ] . groupby( [ 'user_id' , 'date' ] ) . item_id. count( ) . reset_index( )
interval_buy = df_interval_buy. groupby( 'user_id' ) [ 'date' ] . apply ( lambda x: x. sort_values( ) . diff( 1 ) . dropna( ) . head( 1 ) ) . reset_index( )
interval_buy[ 'date' ] = interval_buy[ 'date' ] . apply ( lambda x : x. days)
interval_buy. drop( 'level_1' , axis= 1 , inplace= True )
interval_buy. rename( columns= { 'date' : 'interval_buy' } , inplace= True )
labels = pd. merge( labels, interval_buy, how= 'left' , on= 'user_id' )
del df_interval_buy
gc. collect( )
是否浏览未下单
df_browse_buy = df. loc[ ( df[ 'behavior_type' ] == 1 ) | ( df[ 'behavior_type' ] == 4 ) , [ 'user_id' , 'item_id' , 'behavior_type' , 'time' ] ]
browse_not_buy = pd. pivot_table( df_browse_buy, index= [ 'user_id' , 'item_id' ] , columns= [ 'behavior_type' ] , values= [ 'time' ] , aggfunc= [ 'count' ] )
browse_not_buy. columns = [ 'browse' , 'buy' ]
browse_not_buy. fillna( 0 , inplace= True )
browse_not_buy[ 'browse_not_buy' ] = 0
browse_not_buy. loc[ ( browse_not_buy[ 'browse' ] > 0 ) & ( browse_not_buy[ 'buy' ] == 0 ) , 'browse_not_buy' ] = 1
browse_not_buy = browse_not_buy. groupby( 'user_id' ) [ 'browse_not_buy' ] . sum ( ) . reset_index( )
labels = pd. merge( labels, browse_not_buy, how= 'left' , on= 'user_id' )
labels[ 'browse_not_buy' ] = labels[ 'browse_not_buy' ] . apply ( lambda x: '是' if x> 0 else '否' )
是否加购未下单
df_cart_buy = df. loc[ ( df[ 'behavior_type' ] == 3 ) | ( df[ 'behavior_type' ] == 4 ) , [ 'user_id' , 'item_id' , 'behavior_type' , 'time' ] ]
cart_not_buy = pd. pivot_table( df_cart_buy, index= [ 'user_id' , 'item_id' ] , columns= [ 'behavior_type' ] , values= [ 'time' ] , aggfunc= [ 'count' ] )
cart_not_buy. columns = [ 'cart' , 'buy' ]
cart_not_buy. fillna( 0 , inplace= True )
cart_not_buy[ 'cart_not_buy' ] = 0
cart_not_buy. loc[ ( cart_not_buy[ 'cart' ] > 0 ) & ( cart_not_buy[ 'buy' ] == 0 ) , 'cart_not_buy' ] = 1
cart_not_buy = cart_not_buy. groupby( 'user_id' ) [ 'cart_not_buy' ] . sum ( ) . reset_index( )
labels = pd. merge( labels, cart_not_buy, how= 'left' , on= 'user_id' )
labels[ 'cart_not_buy' ] = labels[ 'cart_not_buy' ] . apply ( lambda x: '是' if x> 0 else '否' )
用户属性标签
是否复购用户
#分组,统计购买次数
buy_again = df[df['behavior_type']==4].groupby('user_id')['item_id'].count().reset_index()
#更新列名
buy_again.rename(columns={'item_id':'buy_again'},inplace=True)
labels = pd.merge(labels,buy_again,how='left',on='user_id')
#修改原对象,用-1填充
labels['buy_again'].fillna(-1,inplace=True)
访问活跃度
user_active_level = labels[ 'counts_30_active' ] . value_counts( ) . sort_index( ascending= False )
plt. figure( figsize= ( 16 , 9 ) )
user_active_level. plot( title= '30天内访问次数与访问人数的关系' , fontsize= 18 )
plt. ylabel( '访问人数' , fontsize= 14 )
plt. xlabel( '访问次数' , fontsize= 14 )
labels[ 'user_active_level' ] = '高'
labels. loc[ labels[ 'counts_30_active' ] <= 16 , 'user_active_level' ] = '低'
购买活跃度
buy_active_level = labels[ 'counts_30_buy' ] . value_counts( ) . sort_index( ascending= False )
plt. figure( figsize= ( 16 , 9 ) )
buy_active_level. plot( title= '30天内购买次数与购买人数的关系' , fontsize= 18 )
plt. ylabel( '购买人数' , fontsize= 14 )
plt. xlabel( '购买次数' , fontsize= 14 )
labels[ 'buy_active_level' ] = '高'
labels. loc[ labels[ 'counts_30_buy' ] <= 14 , 'buy_active_level' ] = '低'
购买的品类是否单一
buy_single = df[ df[ 'behavior_type' ] == 4 ] . groupby( 'user_id' ) . item_category. nunique( ) . reset_index( )
buy_single. rename( columns= { 'item_category' : 'buy_single' } , inplace= True )
labels = pd. merge( labels, buy_single, how= 'left' , on= 'user_id' )
labels[ 'buy_single' ] . fillna( - 1 , inplace= True )
labels[ 'buy_single' ] = labels[ 'buy_single' ] . apply ( lambda x: '是' if x> 1 else '否' if x== 1 else '未购买' )
用户价值分组
last_buy_days = labels[ 'days_buy' ] . value_counts( ) . sort_index( )
plt. figure( figsize= ( 16 , 9 ) )
last_buy_days. plot( title= '最后一次购买距今天数与购买人数的关系' , fontsize= 18 )
plt. ylabel( '购买人数' , fontsize= 14 )
plt. xlabel( '距今天数' , fontsize= 14 )
labels[ 'buy_days_level' ] = '高'
labels. loc[ labels[ 'days_buy' ] > 8 , 'buy_days_level' ] = '低'
labels[ 'rfm_value' ] = labels[ 'buy_active_level' ] . str . cat( labels[ 'buy_days_level' ] )
def trans_value ( x) :
if x == '高高' :
return '重要价值客户'
elif x == '低高' :
return '重要深耕客户'
elif x == '高低' :
return '重要唤回客户'
else :
return '即将流失客户'
labels[ 'rfm' ] = labels[ 'rfm_value' ] . apply ( trans_value)
labels. drop( [ 'buy_days_level' , 'rfm_value' ] , axis= 1 , inplace= True )
labels[ 'rfm' ] . value_counts( )