产品、用户数据分析

# 导入相关包跟模块
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x : '%.2f' % x)  # pandas禁用科学计数法
# 读取数据
data = pd.read_csv('data.csv', index_col=0)  # 不需要第一列
data.shape
(564169, 11)
# 查看字段
data.columns
Index(['event_time', 'order_id', 'product_id', 'category_id', 'category_code',
       'brand', 'price', 'user_id', 'age', 'sex', 'local'],
      dtype='object')
# 数据预览
data.head()
event_timeorder_idproduct_idcategory_idcategory_codebrandpriceuser_idagesexlocal
02020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南
12020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南
22020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京
32020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京
42020-04-24 19:16:21 UTC229458426315407423622739483168174244392268105471367840000.00NaNkarcher217.571515915625443148032.0032.00广东
# 数据概览
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     564169 non-null  object 
 1   order_id       564169 non-null  int64  
 2   product_id     564169 non-null  int64  
 3   category_id    564169 non-null  float64
 4   category_code  434799 non-null  object 
 5   brand          536945 non-null  object 
 6   price          564169 non-null  float64
 7   user_id        564169 non-null  float64
 8   age            564169 non-null  float64
 9   sex            564169 non-null  object 
 10  local          564169 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 51.7+ MB

数据预处理

  • 缺失值
  • 重复值
  • 异常值
# 处理日期列
data['date'] = data.event_time.apply(lambda x: x.split(' ')[0])
data.head()
event_timeorder_idproduct_idcategory_idcategory_codebrandpriceuser_idagesexlocaldate
02020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-24
12020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-24
22020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-24
32020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-24
42020-04-24 19:16:21 UTC229458426315407423622739483168174244392268105471367840000.00NaNkarcher217.571515915625443148032.0032.00广东2020-04-24
# 转化为日期格式
data['date'] = pd.to_datetime(data['date'])
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   event_time     564169 non-null  object        
 1   order_id       564169 non-null  int64         
 2   product_id     564169 non-null  int64         
 3   category_id    564169 non-null  float64       
 4   category_code  434799 non-null  object        
 5   brand          536945 non-null  object        
 6   price          564169 non-null  float64       
 7   user_id        564169 non-null  float64       
 8   age            564169 non-null  float64       
 9   sex            564169 non-null  object        
 10  local          564169 non-null  object        
 11  date           564169 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2), object(5)
memory usage: 56.0+ MB
# 提取月份
data['month'] = data.date.dt.month
data.head()
event_timeorder_idproduct_idcategory_idcategory_codebrandpriceuser_idagesexlocaldatemonth
02020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-244
12020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-244
22020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-244
32020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-244
42020-04-24 19:16:21 UTC229458426315407423622739483168174244392268105471367840000.00NaNkarcher217.571515915625443148032.0032.00广东2020-04-244
# 提取星期几,0为周一,6为星期六
# data['weekday1'] = data.date.dt.weekday
# data.head()
# 0为星期日,1为星期一,2为星期二
data['weekday'] = data.date.apply(lambda x: x.strftime("%w"))
data.head()
event_timeorder_idproduct_idcategory_idcategory_codebrandpriceuser_idagesexlocaldatemonthweekday
02020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-2445
12020-04-24 11:50:39 UTC229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-2445
22020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-2445
32020-04-24 14:37:43 UTC229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-2445
42020-04-24 19:16:21 UTC229458426315407423622739483168174244392268105471367840000.00NaNkarcher217.571515915625443148032.0032.00广东2020-04-2445
# 删除原来字符时间列
# data.drop(labels=['event_time'], inplace=True)
del data['event_time']
data.head()
order_idproduct_idcategory_idcategory_codebrandpriceuser_idagesexlocaldatemonthweekday
0229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-2445
1229435993205453698615159662235090899062268105426648171520.00electronics.tabletsamsung162.011515915625441993984.0024.00海南2020-04-2445
2229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-2445
3229444402405808622022739483190571836582268105430162997248.00electronics.audio.headphonehuawei77.521515915625447879424.0038.00北京2020-04-2445
4229458426315407423622739483168174244392268105471367840000.00NaNkarcher217.571515915625443148032.0032.00广东2020-04-2445
# 缺失值处理
data.isnull().sum() #  查看缺失值情况
# 发现数据中category_code(产品品类)和brand(品牌)两列存在数据缺失,
# 因为产品品类缺失的比较多,直接用‘M’来替代缺失值;而brand缺失比较少,
# 不会对结果产生影响,直接删除缺失值;
order_id              0
product_id            0
category_id           0
category_code    129370
brand             27224
price                 0
user_id               0
age                   0
sex                   0
local                 0
date                  0
month                 0
weekday               0
dtype: int64
# 用特殊字符“M”替换缺失值
data['category_code'] = data['category_code'].fillna('M')
# 丢掉brand字段为空的行
data = data[data.brand.notnull()]  #  直接过滤
data.isnull().sum()
order_id         0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
age              0
sex              0
local            0
date             0
month            0
weekday          0
dtype: int64
# 重复值处理
data.duplicated().sum()  # 查看重复行
# 存在634条重复数据,但是真实的销售数据中可能存在一个用户在同天下单两次或多次的情况,
# 所以这里的处理方法是增加购买数量和总价格的列,
# 而不对重复值直接删除处理
634
# 增加新列:购买数量
data = data.value_counts().reset_index().rename(columns={0: "buy_count"})
# 增加新列:购买总金额
data['buy_amount'] = data['price']*data['buy_count']
data.head()
order_idproduct_idcategory_idcategory_codebrandpriceuser_idagesexlocaldatemonthweekdaybuy_countbuy_amount
0231894587981116298323090182048333178162268105479144079872.00Mcompliment0.561515915625465863936.0028.00浙江2020-05-285442.24
1229574059474970222915159662235091048922268105428166508800.00electronics.smartphoneapple1387.011515915625448766464.0021.00北京2020-04-264045548.04
2238844098113467469815159662235091067572360741867017995776.00appliances.environment.air_conditionersamsung366.411515915625514599680.0050.00广东2020-11-1611141465.64
3237504333155506674022739483083700967642268105409048871168.00computers.network.routeraltel57.851515915625504379136.0019.00上海2020-08-13844231.40
4233499988703838308915159662235090900312268105402673529600.00Mvitek18.501515915625447765248.0018.00广东2020-06-1965355.50
# 异常值处理
data.describe(percentiles=[0.01, 0.25, 0.75, 0.99]).T
# 以上7个字段均无异常情况:price和amount最小值为0,
# 可能是免费商品或者赠品,不属于异常情况;
countmeanstdmin1%25%50%75%99%max
order_id536311.002370510904966508032.0020245175202374012.002294359932054536960.002305277023666307584.002353674406846267392.002376454570843832320.002388440981134596608.002388440981134690304.002388440981134693888.00
product_id536311.001692699904736436480.00327324678972381504.001515966223509088512.001515966223509088512.001515966223509104896.001515966223509261824.001515966223527326208.002388434452475807744.002388434452476881920.00
category_id536311.002273068434487365888.0021891838583158944.002268105388421284352.002268105388991709952.002268105406549066752.002268105428166508800.002268105439323357952.002374498914000592384.002374498914001945600.00
price536311.00214.54305.980.001.1324.5199.51289.331387.0111574.05
user_id536311.001515915625493883648.0024990827.601515915625439951872.001515915625441152000.001515915625467037184.001515915625486696704.001515915625511521280.001515915625514800128.001515915625514891264.00
age536311.0033.1810.1216.0016.0024.0033.0042.0050.0050.00
month536311.007.722.561.001.006.008.0010.0011.0011.00
buy_count536311.001.000.041.001.001.001.001.001.004.00
buy_amount536311.00214.73306.480.001.1324.9899.51289.331387.0111574.05
# 同时查看其它数据是否存在异常情况
data.describe(include='all').T
# 发现日期中存在“1970-01-01”的数据,应该是异常数据,予以删除;
countuniquetopfreqfirstlastmeanstdmin25%50%75%max
order_id536311.00NaNNaNNaNNaTNaT2370510904966508032.0020245175202374012.002294359932054536960.002353674406846267392.002376454570843832320.002388440981134596608.002388440981134693888.00
product_id536311.00NaNNaNNaNNaTNaT1692699904736436480.00327324678972381504.001515966223509088512.001515966223509104896.001515966223509261824.001515966223527326208.002388434452476881920.00
category_id536311.00NaNNaNNaNNaTNaT2273068434487365888.0021891838583158944.002268105388421284352.002268105406549066752.002268105428166508800.002268105439323357952.002374498914001945600.00
category_code536311124M116093NaTNaTNaNNaNNaNNaNNaNNaNNaN
brand536311868samsung96123NaTNaTNaNNaNNaNNaNNaNNaNNaN
price536311.00NaNNaNNaNNaTNaT214.54305.980.0024.5199.51289.3311574.05
user_id536311.00NaNNaNNaNNaTNaT1515915625493883648.0024990827.601515915625439951872.001515915625467037184.001515915625486696704.001515915625511521280.001515915625514891264.00
age536311.00NaNNaNNaNNaTNaT33.1810.1216.0024.0033.0042.0050.00
sex5363112270454NaTNaTNaNNaNNaNNaNNaNNaNNaN
local53631111广东117097NaTNaTNaNNaNNaNNaNNaNNaNNaN
date5363113232020-10-22 00:00:0083101970-01-012020-11-21NaNNaNNaNNaNNaNNaNNaN
month536311.00NaNNaNNaNNaTNaT7.722.561.006.008.0010.0011.00
weekday5363117686379NaTNaTNaNNaNNaNNaNNaNNaNNaN
buy_count536311.00NaNNaNNaNNaTNaT1.000.041.001.001.001.004.00
buy_amount536311.00NaNNaNNaNNaTNaT214.73306.480.0024.9899.51289.3311574.05
# 去掉日期异常值
data = data[data.date>'1970-01-01'] # 直接过滤掉
data.date.min()
Timestamp('2020-01-05 00:00:00')
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 535065 entries, 0 to 536310
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   order_id       535065 non-null  int64         
 1   product_id     535065 non-null  int64         
 2   category_id    535065 non-null  float64       
 3   category_code  535065 non-null  object        
 4   brand          535065 non-null  object        
 5   price          535065 non-null  float64       
 6   user_id        535065 non-null  float64       
 7   age            535065 non-null  float64       
 8   sex            535065 non-null  object        
 9   local          535065 non-null  object        
 10  date           535065 non-null  datetime64[ns]
 11  month          535065 non-null  int64         
 12  weekday        535065 non-null  object        
 13  buy_count      535065 non-null  int64         
 14  buy_amount     535065 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(4), object(5)
memory usage: 65.3+ MB
data.shape
(535065, 15)

数据分析

  • 基础维度分析:地域、年龄、性别、0元用户
  • 产品分析:销量前10、销售额前10、销量前5人群特征
  • 用户分析:生命周期模型、帕累托模型、RFM模型
# 地区维度销量
data_local = data.groupby('local')['user_id'].nunique().reset_index().sort_values('user_id', ascending=False)
data_local.rename(columns={'user_id':'用户数'}, inplace=True)
data_local
local用户数
4广东21382
0上海16031
1北京15928
5江苏5561
7海南5449
2四川5445
6浙江5370
8湖北5355
10重庆5342
3天津5337
9湖南5330
# 使用pyecharts进行可视化
from pyecharts import options as opts
from pyecharts.charts import Bar, Line
# 地域维度
bar = Bar()
bar.add_xaxis(data_local.local.tolist())
bar.add_yaxis('用户数',data_local['用户数'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各地区域用户数量情况",subtitle="区域用户数"),  #  设置主副标题
                    xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()
    <div id="dae8b16860cc447b9d61f32b0227b510" style="width:900px; height:500px;"></div>
# 性别维度
sex = data.groupby('sex')['user_id'].nunique().reset_index()
sex.rename(columns={'user_id':'用户数量'}, inplace=True)
sex
sex用户数量
047235
147628
bar = Bar()
bar.add_xaxis(sex.sex.tolist())
bar.add_yaxis('用户',sex['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="性别分类用户数情况",subtitle="用户数"),
                    xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="da7684cee9fd43ff83621c244ab2c8f6" style="width:900px; height:500px;"></div>
# 年龄维度
age = data.groupby('age')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age
age用户数量
2743.002875
2844.002854
1632.002841
420.002838
1228.002825
723.002820
2238.002820
2339.002813
2036.002805
1329.002802
016.002797
1026.002793
622.002792
521.002786
2945.002784
3248.002781
1531.002772
925.002768
824.002764
1935.002764
2642.002763
218.002759
1430.002755
2541.002753
1733.002745
1834.002738
117.002725
1127.002724
3450.002724
3046.002713
2137.002710
3349.002707
319.002703
2440.002685
3147.002677
# 数据太多,直接可视化会看不清,做分箱处理
data_age = data.copy()
bins = [15, 20, 25, 30, 35, 40, 45, 50]
labels = ['(15-20]', '(20-25]', '(25-30]', '(30-35]', '(35-40]', '(40-45]', '(45-50]']
data_age['age_bin'] = pd.cut(x=data.age, bins=bins, labels=labels)

age = data_age.groupby('age_bin')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age
age_bin用户数量
5(40-45]13969
1(20-25]13867
2(25-30]13831
3(30-35]13802
4(35-40]13775
0(15-20]13726
6(45-50]13535
bar = Bar()
bar.add_xaxis(age.age_bin.tolist())
bar.add_yaxis('用户',age['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各年龄用户数情况",subtitle="用户数"),
                    xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="4cf24a3b14b24b4e902b467ca391133d" style="width:900px; height:500px;"></div>
# 不同年龄段人群的消费差异
age_bin_data = data_age.groupby('age_bin').agg(消费金额=('buy_amount', 'sum'), 下单次数=('order_id', 'nunique'))
age_bin_data
消费金额下单次数
age_bin
(15-20]16007287.1854302
(20-25]16500743.7055546
(25-30]16513446.0054723
(30-35]17004578.7058275
(35-40]14995577.4449744
(40-45]17078724.5459067
(45-50]16886278.3257085
bar1 = (
    Bar()
    .add_xaxis(list(age_bin_data.index))
    .add_yaxis('消费金额', list(age_bin_data['消费金额']), yaxis_index=0)
    .extend_axis(yaxis=opts.AxisOpts(type_="value",
#                                      axistick_opts=opts.AxisTickOpts(is_show=True),
                                     splitline_opts=opts.SplitLineOpts(is_show=True),))  # 双坐标轴 这个必须添加,添加虚线
    .set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='消费金额'))
)
bar2 = (
    Bar()
    .add_xaxis(list(age_bin_data.index))
    .add_yaxis('下单次数', list(age_bin_data['下单次数']),yaxis_index=1)
    .set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='下单次数'))
)
bar1.overlap(bar2)
bar1.render_notebook()
    <div id="f0f2a988d9144ec1aeef7b7d28826084" style="width:900px; height:500px;"></div>
# 0元用户
user_0 = data[data['price']==0]['user_id'].reset_index(drop=True)
user_0.shape
(30,)
user_0[~user_0.isin(data[data['price']>0]['user_id'])]
# 30个0元用户中,只有一位是没有产生消费的,可能是中奖用户
21   1515915625468531712.00
Name: user_id, dtype: float64
# 销量前10的产品
top = data.groupby('product_id').agg(销售总量=('buy_count','sum')).reset_index().sort_values('销售总量', ascending=False).reset_index(drop=True).head(10)
top
product_id销售总量
015159662235178469282759
115159662235091067862597
215159662235090885322550
315159662235090886132549
415159662235090885672497
515159662235101745512370
615159662235090885212156
715159662235091041452037
815159662235090886391904
915159662235091170741797
bar = Bar()
bar.add_xaxis(top.product_id.tolist())
bar.add_yaxis('销量',top['销售总量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销量前十", subtitle='产品销售量'),
                    xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="7038c9df402a44bfb1327f249699ac13" style="width:900px; height:500px;"></div>
# 销售前十的产品类别
category_top = data[data.category_code != "M"].groupby('category_code').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(10)
category_top
category_code销量
88electronics.smartphone102169
58computers.notebook25860
30appliances.kitchen.refrigerators20020
81electronics.audio.headphone19739
92electronics.video.tv17623
13appliances.environment.vacuum15906
33appliances.kitchen.washer14163
25appliances.kitchen.kettle11869
63computers.peripherals.mouse10146
99furniture.kitchen.table9659
bar = Bar()
bar.add_xaxis(category_top['category_code'].tolist())
bar.add_yaxis('销量',category_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销量前十", subtitle="类别销量"),
                    xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="1f90c798f25c4c01a0db3a6d918d542a" style="width:900px; height:500px;"></div>
# 产品销售额情况
product_amount_top = data.groupby('product_id').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
product_amount_top
product_id销售额
4615159662235090885672138006.31
9515159662235090886711540314.88
18315159662235090892841062128.50
6715159662235090886281007196.96
121515966223509088509921041.65
2511515966223509089438850236.39
261515966223509088532767295.00
10351515966223509104892760081.48
12211515966223509105893739893.40
721515966223509088639661068.80
bar = Bar()
bar.add_xaxis(product_amount_top.product_id.tolist())
bar.add_yaxis('销售额',product_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销售额情况", subtitle='销售额前十产品'),
                    xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="f3a5b14750424d7a879434e0c06035f5" style="width:900px; height:500px;"></div>
# 产品类别销售额情况
category_amount_top = data[data.category_code != "M"].groupby('category_code').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
category_amount_top
category_code销售额
88electronics.smartphone41214019.09
58computers.notebook14785855.65
92electronics.video.tv8724539.82
30appliances.kitchen.refrigerators8598364.87
33appliances.kitchen.washer4923616.33
13appliances.environment.vacuum2439553.99
89electronics.tablet2343150.48
87electronics.clocks2180030.73
9appliances.environment.air_conditioner1826364.85
23appliances.kitchen.hood1799268.89
bar = Bar()
bar.add_xaxis(category_amount_top.category_code.tolist())
bar.add_yaxis('销售额',category_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销售额情况", subtitle='销售额前十产品类别'),
                    xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=75),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="0cd0495fa04746748323be044d107b4c" style="width:900px; height:500px;"></div>
# 手机销量表现最好的前5个品牌
brand_top = data[data.category_code == 'electronics.smartphone'].groupby('brand').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(5)
brand_top
brand销量
14samsung51376
1apple23365
18xiaomi8328
7huawei7738
12oppo6876
bar = Bar()
bar.add_xaxis(brand_top.brand.tolist())
bar.add_yaxis('销量',brand_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="手机品牌销量情况", subtitle='销量前5手机品牌'),
                    xaxis_opts=opts.AxisOpts(name='手机品牌', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="22e71d28a7cc4bc29596388da0bded6d" style="width:900px; height:500px;"></div>
#  三星手机用户的分析, 筛选手机品牌为  sumsung  且产品类型为  electronics.smartphone 
age_bin_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('age_bin').agg(销量=('buy_count','sum')).reset_index().sort_values('销量',ascending=False)
age_bin_sumsung_phone
age_bin销量
5(40-45]7651
6(45-50]7640
3(30-35]7593
2(25-30]7320
0(15-20]7273
1(20-25]7125
4(35-40]6774
bar = Bar()
bar.add_xaxis(age_bin_sumsung_phone.age_bin.tolist())
bar.add_yaxis('销量',age_bin_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各年龄段情况", subtitle="三星手机用户画像"),
                    xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()
    <div id="d39450a08c40435d9334a37616a63d8e" style="width:900px; height:500px;"></div>
local_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('local').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量',ascending=False)
local_sumsung_phone
local销量
4广东11491
0上海8635
1北京8356
2四川3231
9湖南3120
7海南2874
10重庆2851
6浙江2828
5江苏2717
3天津2684
8湖北2589
bar = Bar()
bar.add_xaxis(local_sumsung_phone.local.tolist())
bar.add_yaxis('销量',local_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各区域销量情况", subtitle="区域用户画像"),
                    xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="591a52be7a6d47e6bd7c33646258990f" style="width:900px; height:500px;"></div>
sex_samsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('sex').agg(销量=('buy_count','sum')).reset_index()
sex_samsung_phone
sex销量
025477
125899
bar = Bar()
bar.add_xaxis(sex_samsung_phone.sex.tolist())
bar.add_yaxis('销量',sex_samsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各性别销量情况", subtitle="性别用户画像"),
                    xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="54166c92e64b4d61ae7b039fbf56c68c" style="width:900px; height:500px;"></div>

用户生命周期模型

pivoted_amount = data[data['buy_amount']>0].pivot_table(index='user_id', 
                                       columns='month',
                                      values='buy_count',
                                      aggfunc='sum').fillna(0)
pivoted_amount.head()
month1234567891011
user_id
1515915625439951872.000.000.000.000.000.000.001.000.000.000.000.00
1515915625440038400.000.000.000.000.000.000.000.000.001.001.000.00
1515915625440051712.000.000.000.000.000.000.000.000.000.003.0010.00
1515915625440099840.000.000.000.000.002.001.001.000.005.0010.002.00
1515915625440121600.000.000.000.000.001.000.001.000.000.000.000.00
columns_month = pivoted_amount.columns.astype('str')  # 必须把列名转为字符
pivoted_amount.columns = columns_month
pivoted_purchase = pivoted_amount.applymap(lambda x: 1 if x>0 else 0)
pivoted_purchase.head()
month1234567891011
user_id
1515915625439951872.0000000010000
1515915625440038400.0000000000110
1515915625440051712.0000000000011
1515915625440099840.0000001110111
1515915625440121600.0000001010000
def active_status(data):
    status =[]
    for i in range(11):
        #若本月没有消费
        if data[i] ==0:
            if len(status)>0:  #如果不是第一个月,
                if status[i-1]=='未注册': #如果上个月已经是未注册,那么本月也是未注册
                    status.append('未注册') 
                else:  #如果上月已注册,则本月为不活跃
                    status.append('不活跃')
            else:  #如果是第一个月
                status.append('未注册') #则未注册
        #若本月消费
        else:
            if len(status)==0: #如果是第一个月,则为新注册用户
                status.append('新客户')
            else:  #如果不是第一个月
                if status[i-1]=='不活跃':  #如果上月为不活跃,那么本月为回流
                    status.append('回流')
                elif status[i-1]=='未注册':  #如果上月为未注册,那么本月为新注册
                    status.append('新客户')
                else:  #如果上月为活跃,本月也为活跃
                    status.append('活跃')
    return pd.Series(status,index=columns_month)
pivoted_purchase_status = pivoted_purchase.apply(lambda x: active_status(x), axis=1)
pivoted_purchase_status.head()
month1234567891011
user_id
1515915625439951872.00未注册未注册未注册未注册未注册未注册新客户不活跃不活跃不活跃不活跃
1515915625440038400.00未注册未注册未注册未注册未注册未注册未注册未注册新客户活跃不活跃
1515915625440051712.00未注册未注册未注册未注册未注册未注册未注册未注册未注册新客户活跃
1515915625440099840.00未注册未注册未注册未注册新客户活跃活跃不活跃回流活跃活跃
1515915625440121600.00未注册未注册未注册未注册新客户不活跃回流不活跃不活跃不活跃不活跃
purchase_count = pivoted_purchase_status.apply(lambda x:x.value_counts())
# purchase_count.head()
# 去除未注册的数据行
purchase_count = purchase_count[purchase_count.index != "未注册"]
purchase_count = purchase_count.fillna(0)
purchase_count
month1234567891011
不活跃0.001190.00234636855852181182418740985646657552884256.00
回流0.000.002614321377125134624244419937402444.00
新客户1813.001613.00149141771391489502203622052816356072938.00
活跃0.00623.008198001865363943098765718249413116.00
purchase_count = purchase_count.T
purchase_count
不活跃回流新客户活跃
month
10.000.001813.000.00
21190.000.001613.00623.00
32346.00261.001491.00819.00
43685.00432.004177.00800.00
55852.001377.0013914.001865.00
618118.001251.008950.003639.00
724187.003462.0022036.004309.00
840985.004244.0022052.008765.00
964665.004199.008163.007182.00
1075528.003740.005607.004941.00
1184256.002444.002938.003116.00
line = Line()
line.add_xaxis(purchase_count.index.tolist())
line.add_yaxis('不活跃',purchase_count['不活跃'].tolist())
line.add_yaxis('活跃',purchase_count['活跃'].tolist())
line.add_yaxis('新客户',purchase_count['新客户'].tolist())
line.add_yaxis('回流',purchase_count['回流'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="用户生命周期分层情况", subtitle="用户分层"),
                    xaxis_opts=opts.AxisOpts(name='月份', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='分层', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()
    <div id="5314f0ff88d54f9b9d2dc2234bab163d" style="width:900px; height:500px;"></div>

帕累托模型

# 查询每个用户的消费金额以及累计消费金额
user_2_8 = data.groupby('user_id').agg(消费金额=('buy_amount', 'sum')).sort_values('消费金额', ascending=False).reset_index()
user_2_8['累计销售额'] = user_2_8['消费金额'].cumsum()
user_2_8.head()
user_id消费金额累计销售额
01515915625512422912.00160604.07160604.07
11515915625513695488.00158277.37318881.44
21515915625512377088.00149967.06468848.50
31515915625513577472.00135672.84604521.34
41515915625514597888.00133945.88738467.22
p = user_2_8['消费金额'].cumsum() / user_2_8['消费金额'].sum()
key = p[p>0.8].index[0]
key
25408
key / user_2_8.shape[0]
0.2739259339119185
line = Line()
line.add_xaxis(user_2_8.index.tolist())
line.add_yaxis('累计销售额',user_2_8['累计销售额'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="帕累托模型", subtitle="累计销售额"),
#                     xaxis_opts=opts.AxisOpts(name='', name_location='middle', name_gap=35),  #  设置X轴标签
#                     yaxis_opts=opts.AxisOpts(name='', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()
    <div id="6084df0633cf4a048896a735c59a7adf" style="width:900px; height:500px;"></div>
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

灯下夜无眠

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值