# 0、代码 (Code)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import os
import zipfile
def unzipDir(path):
    """Extract every member of a zip archive into the archive's own directory.

    Args:
        path: Filesystem path of the zip file to unpack.

    Raises:
        Any exception raised by ``zipfile.ZipFile`` or ``extractall`` is
        propagated unchanged to the caller.
    """
    target_dir = os.path.split(path)[0]
    # The context manager closes the archive handle even if extraction fails,
    # so the zip file is never left open.
    with zipfile.ZipFile(path) as zip_file:
        zip_file.extractall(target_dir)
# Unpack the raw archive next to itself, then load the extracted CSV.
path = r'D:\data_test\tianchi_mobile_recommend_train_user.zip'
unzipDir(path)
user_action = pd.read_csv(r'd:\data_test\tianchi_mobile_recommend_train_user.csv')
print(user_action.shape)

# Work on a reproducible 5% random sample, persisted to disk and re-read.
sample = user_action.sample(frac=0.05, random_state=5, axis=0)
sample.to_csv(r'D:\data_test\user_result.csv', encoding='utf_8_sig')
user_data = pd.read_csv(r'd:\data_test\user_result.csv')
print(user_data.shape)

# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# "first rows + last rows" preview on every pandas version.
print(pd.concat([user_data.head(), user_data.tail()]))
user_data.info()

# 'Unnamed: 0' is the index column that to_csv wrote above (no index=False),
# which read_csv then loads back as an ordinary column.
del user_data['user_geohash']
del user_data['Unnamed: 0']
print(user_data.columns)
# Parse the raw timestamp column once and derive calendar features from it.
time = pd.to_datetime(user_data['time'])
for feature in ('hour', 'weekday', 'date'):
    user_data[feature] = getattr(time.dt, feature)

# Replace the numeric behavior codes with readable labels; an unknown code
# raises KeyError instead of being silently mapped.
behavior = {1: 'pv', 2: 'collect', 3: 'cart', 4: 'buy'}
user_data['behavior_type'] = user_data['behavior_type'].apply(
    lambda code: behavior[code]
)
print(user_data.head())
# Number of rows per behavior label ('buy', 'cart', 'collect', 'pv').
total_liu = user_data.groupby(['behavior_type']).size()
# Look the page-view count up by its label. The previous total_liu[3] only
# worked because groupby sorts the labels so that 'pv' happens to be fourth,
# and positional access on a labelled Series is deprecated in pandas.
pv = total_liu['pv']
uv = user_data['user_id'].nunique()

# Users with at least one 'buy' row, and the page views those users generated.
user_pay = user_data[user_data['behavior_type'] == 'buy']['user_id'].unique()
pv_pay = user_data[user_data['user_id'].isin(user_pay)]['behavior_type'].value_counts()['pv']

pv_per_day = pv / user_data['date'].nunique()
pv_per_user = pv / uv
user_pay_rate = len(user_pay) / uv
pv_pay_rate = pv_pay / pv
pv_per_buy_user = pv_pay / len(user_pay)

# The first line prints the full per-behavior breakdown, not just the pv count.
print('总访问量为',total_liu)
print('总访客数为%i'%uv)
print('消费用户数为 %i'%len(user_pay))
print('消费用户访问量为 %i'%pv_pay)
print('日均访问量为 %.3f'%pv_per_day)
print('人均访问量为 %.3f'%pv_per_user)
print('消费用户人均访问量为 %.3f'%pv_per_buy_user)
print('消费用户数占比为 %.3f%%'%(user_pay_rate * 100))
print('消费用户访问量占比为 %.3f%%'%(pv_pay_rate * 100))
# Rows per (date, behavior), pivoted so that each behavior becomes a column.
day_liu = user_data.groupby(['date', 'behavior_type']).size().unstack()
print(day_liu)

# Distinct users per day; nunique() counts the same thing as the former
# drop_duplicates().count() lambda without a Python-level call per group.
dayily_uv = (
    user_data.groupby('date')['user_id'].nunique()
    .reset_index()
    .rename(columns={'user_id': 'uv'})
)
print(dayily_uv)
dayily_uv.plot(figsize=(12, 3))

# One figure per behavior type.
for i in day_liu.columns:
    plt.figure()
    ax = day_liu[i].plot(figsize=(12, 3))
    # A separating space was missing here, producing titles such as
    # "pvday of analysis"; the hourly charts already use " ".
    ax.set_title(i + " " + "day of analysis")
# Rows per (hour, behavior), pivoted so that each behavior becomes a column.
hour_liu = user_data.groupby(['hour', 'behavior_type']).size().unstack()

# Distinct users seen in each hour of the day.
hour_uv = user_data.groupby('hour')['user_id'].nunique()
hour_uv = hour_uv.reset_index()
hour_uv = hour_uv.rename(columns={'user_id': 'uv'})
hour_uv.plot(figsize=(12, 3))

# One figure per behavior type, titled "<behavior> hour of analysis".
for behavior_name, hourly_counts in hour_liu.items():
    plt.figure()
    ax = hourly_counts.plot(figsize=(12, 3))
    ax.set_title(behavior_name + " " + "hour of analysis")
# Stage-to-stage conversion, in percent, from the per-behavior row counts.
cart_plus_collect = total_liu['cart'] + total_liu['collect']
pv_to_car_collect = cart_plus_collect / total_liu['pv'] * 100
car_collect_to_buy = total_liu['buy'] / cart_plus_collect * 100
pv_to_buy = total_liu['buy'] / total_liu['pv'] * 100
print("浏览到收藏加入购物车的转化率为 %.3f%%"%pv_to_car_collect)
print("购物车收藏到购买的转化率为 %.3f%%"%car_collect_to_buy)
print("浏览到购买转化率 %.3f%%"%pv_to_buy)
df_con = user_data[['user_id', 'item_id', 'time', 'behavior_type']]

# Build the three funnel stages by behavior label. The earlier version relied
# on value_counts().reset_index() naming its columns 'index'/'behavior_type'
# (pandas >= 2.0 names them 'behavior_type'/'count') and on 'cart' and
# 'collect' landing on rows 1 and 2 of the frequency-sorted result.
behavior_counts = df_con['behavior_type'].value_counts()
loudou = pd.DataFrame({
    'index': ['pv', 'collect+cart', 'buy'],
    'behavior_type': [
        behavior_counts.get('pv', 0),
        behavior_counts.get('collect', 0) + behavior_counts.get('cart', 0),
        behavior_counts.get('buy', 0),
    ],
})
print(loudou)

jieduan0 = list(loudou['behavior_type'])
print(jieduan0)
# Count of the preceding stage for each row; the first stage is compared with
# itself, so its conversion rate is 1.
jieduan1 = jieduan0[:1] + jieduan0[:-1]
loudou['行为阶段1'] = jieduan1
loudou['转化率'] = loudou['behavior_type'] / loudou['行为阶段1']
print(loudou)
from pyecharts import options as opts
from pyecharts.charts import Funnel
from pyecharts.globals import CurrentConfig, NotebookType
# Configure pyecharts for a Jupyter notebook and set the asset host URL
# (presumably where the chart's JavaScript assets are fetched from).
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
CurrentConfig.ONLINE_HOST='https://assets.pyecharts.org/assets/'

# Each funnel label is the stage name followed by its conversion rate in
# percent, rounded to two decimals.
xinwei = list(loudou['index'])
zhuanhua = list(loudou['转化率'])
label = [
    stage + str(round(rate * 100, 2)) + '%'
    for stage, rate in zip(xinwei, zhuanhua)
]

# Pair every label with its stage count and draw the funnel.
funnel_pairs = [list(pair) for pair in zip(label, list(loudou['behavior_type']))]
c = Funnel()
c.add(
    '行为',
    funnel_pairs,
    label_opts=opts.LabelOpts(is_show=True, position='inside'),
    tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{b} : {c}%"),
)
c.set_global_opts(title_opts=opts.TitleOpts(title="流量转化漏斗图"))
c.render_notebook()
# Repurchase rate: share of buyers who bought on two or more distinct dates.
buy_rows = user_data[user_data.behavior_type == 'buy']
cos_rebuy = (
    buy_rows.groupby('user_id')['date']
    .apply(lambda days: len(days.unique()))
    .rename('rebuy_count')
)
repeat_buyer_count = cos_rebuy[cos_rebuy >= 2].count()
print('复购率为 :', round(repeat_buyer_count / cos_rebuy.count(), 4))

# Bounce rate: share of users with exactly one recorded action.
actions_per_user = user_data.groupby('user_id')['behavior_type'].count()
single_action_users = sum(actions_per_user == 1)
attrition_rates = single_action_users / (user_data['user_id'].nunique())
print('跳失率为 %.3f%%'%(attrition_rates * 100) )
# Per-user count of each behavior. crosstab already yields 0 for missing
# combinations, so the former groupby/unstack table (immediately overwritten)
# and its discarded fillna(0) call were dead code and have been removed.
user_liu = pd.crosstab(user_data["user_id"], user_data["behavior_type"])
print(user_liu)

# Ten users with the most purchases.
total_buy_count = user_liu.buy.to_frame()
topbuyer10 = total_buy_count.sort_values(by='buy', ascending=False).head(10)
print(topbuyer10)
# Returns a Styler with in-cell bars; it only renders when this is the last
# expression of a notebook cell.
topbuyer10.reset_index().style.bar(color='skyblue', subset=['buy'])
# Distribution of total purchases per user, on a log-scaled y axis.
tbc_box = total_buy_count.reset_index()
fig, ax = plt.subplots(figsize=[16,6])
ax.set_yscale("log")
sns.countplot(x=tbc_box['buy'], data=tbc_box, palette='Set1')

# Label each bar with its share of all users, in percent.
user_total = len(tbc_box['buy'])
for bar in ax.patches:
    bar_height = bar.get_height()
    share_text = '{:.2f}%'.format(100 * bar_height / user_total)
    ax.annotate(share_text, (bar.get_x() - 0.1, bar_height))
plt.title('用户消费总次数')
plt.show()
# Daily distinct active users, distinct buyers, and the buyer share.
daily_active_user = user_data.groupby('date')['user_id'].nunique()
daily_buy_user = user_data[user_data['behavior_type'] == 'buy'].groupby('date')['user_id'].nunique()
proportion_of_buyer = daily_buy_user / daily_active_user
# Daily number of 'buy' rows and the average purchases per buying user.
daily_buy_count = user_data[user_data['behavior_type'] == 'buy'].groupby('date')['behavior_type'].count()
consumption_per_buyer = daily_buy_count / daily_buy_user

# Long-format table (date, Variable, Value) for the grouped bar chart.
pob_bar = (pd.merge(daily_active_user, daily_buy_user, on='date').reset_index()
           .rename(columns={'user_id_x': '日活跃人数', 'user_id_y': '日消费人数'})
           .set_index('date').stack().reset_index()
           .rename(columns={'level_1': 'Variable', 0: 'Value'}))
pob_line = proportion_of_buyer.reset_index().rename(columns={'user_id': 'Rate'})

# Bars on the left axis, buyer share as a dashed line on the twin right axis.
fig1 = plt.figure(figsize=[25, 6])
ax1 = fig1.add_subplot(111)
ax2 = ax1.twinx()
sns.barplot(x='date', y='Value', hue='Variable', data=pob_bar, ax=ax1, alpha=0.8, palette='husl')
# Drop the per-axes legend; a figure-level legend is added below.
ax1.legend().remove()
# seaborn >= 0.12 only accepts x and y as keyword arguments.
sns.pointplot(x=pob_line['date'], y=pob_line['Rate'], ax=ax2, markers='D', linestyles='--', color='teal')
# One label per plotted day; the former fixed range(0, 16) silently stopped
# labelling after the 16th date.
x = list(range(len(pob_line)))
for a, b in zip(x, pob_line['Rate']):
    ax2.text(a + 0.1, b + 0.001, '%.2f%%' % (b * 100), ha='center', va='bottom', fontsize=12)
fig1.legend(loc='upper center', ncol=2)
plt.title('日消费人数占比')
plt.show()
# Daily purchase counts (bars) and purchases per buying user (line).
cpb_bar = daily_buy_count.reset_index().rename(columns={'behavior_type': 'Num'})
# consumption_per_buyer is the quotient of two differently named Series, so it
# has no name and reset_index labels its value column 0.
cpb_line = consumption_per_buyer.reset_index().rename(columns={0: 'Frequency'})

fig2 = plt.figure(figsize=[16, 6])
ax3 = fig2.add_subplot(111)
ax4 = ax3.twinx()
sns.barplot(x='date', y='Num', data=cpb_bar, ax=ax3, alpha=0.8, palette='pastel')
# seaborn >= 0.12 only accepts x and y as keyword arguments.
sns.pointplot(x=cpb_line['date'], y=cpb_line['Frequency'], ax=ax4, markers='D', linestyles='--', color='teal')
# One label per plotted day; the former fixed range(0, 16) silently stopped
# labelling after the 16th date.
x = list(range(len(cpb_line)))
for a, b in zip(x, cpb_line['Frequency']):
    ax4.text(a + 0.1, b + 0.001, '%.2f' % b, ha='center', va='bottom', fontsize=12)
plt.title('消费用户日人均消费次数')
plt.show()
# Users with at least three recorded actions on a given day.
dau3_df = user_data.groupby(['date', 'user_id'])['behavior_type'].count().reset_index()
dau3_df = dau3_df[dau3_df['behavior_type'] >= 3]
dau3_num = dau3_df.groupby('date')['user_id'].nunique()

fig, ax = plt.subplots(figsize=[16, 6])
# seaborn >= 0.12 only accepts x and y as keyword arguments.
sns.pointplot(x=dau3_num.index, y=dau3_num.values, ax=ax, markers='D', linestyles='--', color='teal')
# One label per plotted day; the former fixed range(0, 16) silently stopped
# labelling after the 16th date.
x = list(range(len(dau3_num)))
for a, b in zip(x, dau3_num.values):
    ax.text(a + 0.1, b + 300, '%i' % b, ha='center', va='bottom', fontsize=14)
plt.title('每日高活跃用户数')
plt.show()
# For each user, the number of days on which they had at least three actions.
dau3_cumsum = dau3_df.groupby('user_id')['date'].count()

fig, ax = plt.subplots(figsize=[16, 6])
ax.set_yscale("log")
# Pass the values as x explicitly: seaborn >= 0.12 no longer accepts the
# plotted vector as a positional argument.
sns.countplot(x=dau3_cumsum.values, palette='Set1')

# Label each bar with its share of all high-activity users, in percent.
high_active_total = len(dau3_cumsum.values)
for p in ax.patches:
    ax.annotate('{:.2f}%'.format(100 * p.get_height() / high_active_total),
                (p.get_x() + 0.2, p.get_height() + 100))
plt.title('高活跃用户累计活跃天数分布')
plt.show()
from pyecharts import options as opts
from pyecharts.charts import Funnel
# NOTE(review): the same column is parsed earlier without an explicit format;
# confirm the raw strings really carry minutes and seconds, otherwise this
# strict format will raise.
user_data['time'] = pd.to_datetime(user_data['time'], format='%Y-%m-%d %H:%M:%S')
df_con = user_data[['user_id', 'item_id', 'time', 'behavior_type']]

# Split the log by behavior and collect the distinct users of each kind.
behavior_col = df_con['behavior_type']
df_pv = df_con[behavior_col == 'pv']
df_collect = df_con[behavior_col == 'collect']
df_cart = df_con[behavior_col == 'cart']
df_buy = df_con[behavior_col == 'buy']
df_pv_uid = df_pv['user_id'].unique()
df_collect_uid = df_collect['user_id'].unique()
df_cart_uid = df_cart['user_id'].unique()
df_buy_uid = df_buy['user_id'].unique()

# Users who collected or carted anything are excluded from the direct path.
collect_cart_list = set(df_collect_uid).union(df_cart_uid)

# Pair each view with each purchase of the same item by the same user, then
# keep pairs where the view precedes the purchase and the user never
# collected or carted.
pv_buy_df = pd.merge(left=df_pv, right=df_buy, how='inner', on=['user_id', 'item_id'], suffixes=('_pv', '_pay'))
skipped_collect_cart = ~pv_buy_df['user_id'].isin(collect_cart_list)
viewed_before_pay = pv_buy_df['time_pv'] < pv_buy_df['time_pay']
pv_buy_df = pv_buy_df[skipped_collect_cart & viewed_before_pay]

uv = user_data['user_id'].nunique()
pv_buy_num = pv_buy_df['user_id'].nunique()
pv_buy_data = pd.DataFrame({'type': ['浏览', '付款'], 'num': [uv, pv_buy_num]})
pv_buy_data['conversion_rates'] = (round((pv_buy_data['num'] / pv_buy_data['num']