产品、用户数据分析

最新推荐文章于 2024-03-13 10:39:52 发布

灯下夜无眠

最新推荐文章于 2024-03-13 10:39:52 发布

阅读量344

点赞数

文章标签： python 数据分析用户分析

本文链接：https://blog.csdn.net/llmuzi123456789/article/details/125228407

版权

# 导入相关包跟模块
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x : '%.2f' % x)  # pandas禁用科学计数法

# 读取数据
data = pd.read_csv('data.csv', index_col=0)  # 不需要第一列
data.shape

(564169, 11)

# 查看字段
data.columns

Index(['event_time', 'order_id', 'product_id', 'category_id', 'category_code',
       'brand', 'price', 'user_id', 'age', 'sex', 'local'],
      dtype='object')

# 数据预览
data.head()

	event_time	order_id	product_id	category_id	category_code	brand	price	user_id	age	sex	local
0	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南
1	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南
2	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京
3	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京
4	2020-04-24 19:16:21 UTC	2294584263154074236	2273948316817424439	2268105471367840000.00	NaN	karcher	217.57	1515915625443148032.00	32.00	女	广东

# 数据概览
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     564169 non-null  object 
 1   order_id       564169 non-null  int64  
 2   product_id     564169 non-null  int64  
 3   category_id    564169 non-null  float64
 4   category_code  434799 non-null  object 
 5   brand          536945 non-null  object 
 6   price          564169 non-null  float64
 7   user_id        564169 non-null  float64
 8   age            564169 non-null  float64
 9   sex            564169 non-null  object 
 10  local          564169 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 51.7+ MB

数据预处理

缺失值
重复值
异常值

# 处理日期列
data['date'] = data.event_time.apply(lambda x: x.split(' ')[0])
data.head()

	event_time	order_id	product_id	category_id	category_code	brand	price	user_id	age	sex	local	date
0	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24
1	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24
2	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24
3	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24
4	2020-04-24 19:16:21 UTC	2294584263154074236	2273948316817424439	2268105471367840000.00	NaN	karcher	217.57	1515915625443148032.00	32.00	女	广东	2020-04-24

# 转化为日期格式
data['date'] = pd.to_datetime(data['date'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   event_time     564169 non-null  object        
 1   order_id       564169 non-null  int64         
 2   product_id     564169 non-null  int64         
 3   category_id    564169 non-null  float64       
 4   category_code  434799 non-null  object        
 5   brand          536945 non-null  object        
 6   price          564169 non-null  float64       
 7   user_id        564169 non-null  float64       
 8   age            564169 non-null  float64       
 9   sex            564169 non-null  object        
 10  local          564169 non-null  object        
 11  date           564169 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2), object(5)
memory usage: 56.0+ MB

# 提取月份
data['month'] = data.date.dt.month
data.head()

	event_time	order_id	product_id	category_id	category_code	brand	price	user_id	age	sex	local	date	month
0	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24	4
1	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24	4
2	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24	4
3	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24	4
4	2020-04-24 19:16:21 UTC	2294584263154074236	2273948316817424439	2268105471367840000.00	NaN	karcher	217.57	1515915625443148032.00	32.00	女	广东	2020-04-24	4

# 提取星期几，0为周一，6为星期六
# data['weekday1'] = data.date.dt.weekday
# data.head()

# 0为星期日，1为星期一，2为星期二
data['weekday'] = data.date.apply(lambda x: x.strftime("%w"))
data.head()

	event_time	order_id	product_id	category_id	category_code	brand	price	user_id	age	sex	local	date	month	weekday
0	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24	4	5
1	2020-04-24 11:50:39 UTC	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24	4	5
2	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24	4	5
3	2020-04-24 14:37:43 UTC	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24	4	5
4	2020-04-24 19:16:21 UTC	2294584263154074236	2273948316817424439	2268105471367840000.00	NaN	karcher	217.57	1515915625443148032.00	32.00	女	广东	2020-04-24	4	5

# 删除原来字符时间列
# data.drop(labels=['event_time'], inplace=True)
del data['event_time']
data.head()

	order_id	product_id	category_id	category_code	brand	price	user_id	age	sex	local	date	month	weekday
0	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24	4	5
1	2294359932054536986	1515966223509089906	2268105426648171520.00	electronics.tablet	samsung	162.01	1515915625441993984.00	24.00	女	海南	2020-04-24	4	5
2	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24	4	5
3	2294444024058086220	2273948319057183658	2268105430162997248.00	electronics.audio.headphone	huawei	77.52	1515915625447879424.00	38.00	女	北京	2020-04-24	4	5
4	2294584263154074236	2273948316817424439	2268105471367840000.00	NaN	karcher	217.57	1515915625443148032.00	32.00	女	广东	2020-04-24	4	5

# 缺失值处理
data.isnull().sum() #  查看缺失值情况
# 发现数据中category_code（产品品类）和brand（品牌）两列存在数据缺失，
# 因为产品品类缺失的比较多，直接用‘M’来替代缺失值；而brand缺失比较少，
# 不会对结果产生影响，直接删除缺失值；

order_id              0
product_id            0
category_id           0
category_code    129370
brand             27224
price                 0
user_id               0
age                   0
sex                   0
local                 0
date                  0
month                 0
weekday               0
dtype: int64

# 用特殊字符“M”替换缺失值
data['category_code'] = data['category_code'].fillna('M')
# 丢掉brand字段为空的行
data = data[data.brand.notnull()]  #  直接过滤
data.isnull().sum()

order_id         0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
age              0
sex              0
local            0
date             0
month            0
weekday          0
dtype: int64

# 重复值处理
data.duplicated().sum()  # 查看重复行
# 存在634条重复数据，但是真实的销售数据中可能存在一个用户在同天下单两次或多次的情况，
# 所以这里的处理方法是增加购买数量和总价格的列，
# 而不对重复值直接删除处理

# 增加新列：购买数量
data = data.value_counts().reset_index().rename(columns={0: "buy_count"})
# 增加新列：购买总金额
data['buy_amount'] = data['price']*data['buy_count']
data.head()

	order_id	product_id	category_id	category_code	brand	price	user_id	age	sex	local	date	month	weekday	buy_count	buy_amount
0	2318945879811162983	2309018204833317816	2268105479144079872.00	M	compliment	0.56	1515915625465863936.00	28.00	女	浙江	2020-05-28	5	4	4	2.24
1	2295740594749702229	1515966223509104892	2268105428166508800.00	electronics.smartphone	apple	1387.01	1515915625448766464.00	21.00	男	北京	2020-04-26	4	0	4	5548.04
2	2388440981134674698	1515966223509106757	2360741867017995776.00	appliances.environment.air_conditioner	samsung	366.41	1515915625514599680.00	50.00	男	广东	2020-11-16	11	1	4	1465.64
3	2375043331555066740	2273948308370096764	2268105409048871168.00	computers.network.router	altel	57.85	1515915625504379136.00	19.00	女	上海	2020-08-13	8	4	4	231.40
4	2334999887038383089	1515966223509090031	2268105402673529600.00	M	vitek	18.50	1515915625447765248.00	18.00	男	广东	2020-06-19	6	5	3	55.50

# 异常值处理
data.describe(percentiles=[0.01, 0.25, 0.75, 0.99]).T
# 以上7个字段均无异常情况：price和amount最小值为0，
# 可能是免费商品或者赠品，不属于异常情况；

	count	mean	std	min	1%	25%	50%	75%	99%	max
order_id	536311.00	2370510904966508032.00	20245175202374012.00	2294359932054536960.00	2305277023666307584.00	2353674406846267392.00	2376454570843832320.00	2388440981134596608.00	2388440981134690304.00	2388440981134693888.00
product_id	536311.00	1692699904736436480.00	327324678972381504.00	1515966223509088512.00	1515966223509088512.00	1515966223509104896.00	1515966223509261824.00	1515966223527326208.00	2388434452475807744.00	2388434452476881920.00
category_id	536311.00	2273068434487365888.00	21891838583158944.00	2268105388421284352.00	2268105388991709952.00	2268105406549066752.00	2268105428166508800.00	2268105439323357952.00	2374498914000592384.00	2374498914001945600.00
price	536311.00	214.54	305.98	0.00	1.13	24.51	99.51	289.33	1387.01	11574.05
user_id	536311.00	1515915625493883648.00	24990827.60	1515915625439951872.00	1515915625441152000.00	1515915625467037184.00	1515915625486696704.00	1515915625511521280.00	1515915625514800128.00	1515915625514891264.00
age	536311.00	33.18	10.12	16.00	16.00	24.00	33.00	42.00	50.00	50.00
month	536311.00	7.72	2.56	1.00	1.00	6.00	8.00	10.00	11.00	11.00
buy_count	536311.00	1.00	0.04	1.00	1.00	1.00	1.00	1.00	1.00	4.00
buy_amount	536311.00	214.73	306.48	0.00	1.13	24.98	99.51	289.33	1387.01	11574.05

# 同时查看其它数据是否存在异常情况
data.describe(include='all').T
# 发现日期中存在“1970-01-01”的数据，应该是异常数据，予以删除；

	count	unique	top	freq	first	last	mean	std	min	25%	50%	75%	max
order_id	536311.00	NaN	NaN	NaN	NaT	NaT	2370510904966508032.00	20245175202374012.00	2294359932054536960.00	2353674406846267392.00	2376454570843832320.00	2388440981134596608.00	2388440981134693888.00
product_id	536311.00	NaN	NaN	NaN	NaT	NaT	1692699904736436480.00	327324678972381504.00	1515966223509088512.00	1515966223509104896.00	1515966223509261824.00	1515966223527326208.00	2388434452476881920.00
category_id	536311.00	NaN	NaN	NaN	NaT	NaT	2273068434487365888.00	21891838583158944.00	2268105388421284352.00	2268105406549066752.00	2268105428166508800.00	2268105439323357952.00	2374498914001945600.00
category_code	536311	124	M	116093	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN
brand	536311	868	samsung	96123	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN
price	536311.00	NaN	NaN	NaN	NaT	NaT	214.54	305.98	0.00	24.51	99.51	289.33	11574.05
user_id	536311.00	NaN	NaN	NaN	NaT	NaT	1515915625493883648.00	24990827.60	1515915625439951872.00	1515915625467037184.00	1515915625486696704.00	1515915625511521280.00	1515915625514891264.00
age	536311.00	NaN	NaN	NaN	NaT	NaT	33.18	10.12	16.00	24.00	33.00	42.00	50.00
sex	536311	2	男	270454	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN
local	536311	11	广东	117097	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN
date	536311	323	2020-10-22 00:00:00	8310	1970-01-01	2020-11-21	NaN	NaN	NaN	NaN	NaN	NaN	NaN
month	536311.00	NaN	NaN	NaN	NaT	NaT	7.72	2.56	1.00	6.00	8.00	10.00	11.00
weekday	536311	7	6	86379	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN
buy_count	536311.00	NaN	NaN	NaN	NaT	NaT	1.00	0.04	1.00	1.00	1.00	1.00	4.00
buy_amount	536311.00	NaN	NaN	NaN	NaT	NaT	214.73	306.48	0.00	24.98	99.51	289.33	11574.05

# 去掉日期异常值
data = data[data.date>'1970-01-01'] # 直接过滤掉
data.date.min()

Timestamp('2020-01-05 00:00:00')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 535065 entries, 0 to 536310
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   order_id       535065 non-null  int64         
 1   product_id     535065 non-null  int64         
 2   category_id    535065 non-null  float64       
 3   category_code  535065 non-null  object        
 4   brand          535065 non-null  object        
 5   price          535065 non-null  float64       
 6   user_id        535065 non-null  float64       
 7   age            535065 non-null  float64       
 8   sex            535065 non-null  object        
 9   local          535065 non-null  object        
 10  date           535065 non-null  datetime64[ns]
 11  month          535065 non-null  int64         
 12  weekday        535065 non-null  object        
 13  buy_count      535065 non-null  int64         
 14  buy_amount     535065 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(4), object(5)
memory usage: 65.3+ MB

data.shape

(535065, 15)

数据分析

基础维度分析：地域、年龄、性别、0元用户
产品分析：销量前10、销售额前10、销量前5人群特征
用户分析：生命周期模型、帕累托模型、RFM模型

# 地区维度销量
data_local = data.groupby('local')['user_id'].nunique().reset_index().sort_values('user_id', ascending=False)
data_local.rename(columns={'user_id':'用户数'}, inplace=True)
data_local

	local	用户数
4	广东	21382
0	上海	16031
1	北京	15928
5	江苏	5561
7	海南	5449
2	四川	5445
6	浙江	5370
8	湖北	5355
10	重庆	5342
3	天津	5337
9	湖南	5330

# 使用pyecharts进行可视化
from pyecharts import options as opts
from pyecharts.charts import Bar, Line

# 地域维度
bar = Bar()
bar.add_xaxis(data_local.local.tolist())
bar.add_yaxis('用户数',data_local['用户数'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各地区域用户数量情况",subtitle="区域用户数"),  #  设置主副标题
                    xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()

    <div id="dae8b16860cc447b9d61f32b0227b510" style="width:900px; height:500px;"></div>

# 性别维度
sex = data.groupby('sex')['user_id'].nunique().reset_index()
sex.rename(columns={'user_id':'用户数量'}, inplace=True)
sex

	sex	用户数量
0	女	47235
1	男	47628

bar = Bar()
bar.add_xaxis(sex.sex.tolist())
bar.add_yaxis('用户',sex['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="性别分类用户数情况",subtitle="用户数"),
                    xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="da7684cee9fd43ff83621c244ab2c8f6" style="width:900px; height:500px;"></div>

# 年龄维度
age = data.groupby('age')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age

	age	用户数量
27	43.00	2875
28	44.00	2854
16	32.00	2841
4	20.00	2838
12	28.00	2825
7	23.00	2820
22	38.00	2820
23	39.00	2813
20	36.00	2805
13	29.00	2802
0	16.00	2797
10	26.00	2793
6	22.00	2792
5	21.00	2786
29	45.00	2784
32	48.00	2781
15	31.00	2772
9	25.00	2768
8	24.00	2764
19	35.00	2764
26	42.00	2763
2	18.00	2759
14	30.00	2755
25	41.00	2753
17	33.00	2745
18	34.00	2738
1	17.00	2725
11	27.00	2724
34	50.00	2724
30	46.00	2713
21	37.00	2710
33	49.00	2707
3	19.00	2703
24	40.00	2685
31	47.00	2677

# 数据太多，直接可视化会看不清，做分箱处理
data_age = data.copy()
bins = [15, 20, 25, 30, 35, 40, 45, 50]
labels = ['(15-20]', '(20-25]', '(25-30]', '(30-35]', '(35-40]', '(40-45]', '(45-50]']
data_age['age_bin'] = pd.cut(x=data.age, bins=bins, labels=labels)

age = data_age.groupby('age_bin')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age

	age_bin	用户数量
5	(40-45]	13969
1	(20-25]	13867
2	(25-30]	13831
3	(30-35]	13802
4	(35-40]	13775
0	(15-20]	13726
6	(45-50]	13535

bar = Bar()
bar.add_xaxis(age.age_bin.tolist())
bar.add_yaxis('用户',age['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各年龄用户数情况",subtitle="用户数"),
                    xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="4cf24a3b14b24b4e902b467ca391133d" style="width:900px; height:500px;"></div>

# 不同年龄段人群的消费差异
age_bin_data = data_age.groupby('age_bin').agg(消费金额=('buy_amount', 'sum'), 下单次数=('order_id', 'nunique'))
age_bin_data

	消费金额	下单次数
age_bin
(15-20]	16007287.18	54302
(20-25]	16500743.70	55546
(25-30]	16513446.00	54723
(30-35]	17004578.70	58275
(35-40]	14995577.44	49744
(40-45]	17078724.54	59067
(45-50]	16886278.32	57085

bar1 = (
    Bar()
    .add_xaxis(list(age_bin_data.index))
    .add_yaxis('消费金额', list(age_bin_data['消费金额']), yaxis_index=0)
    .extend_axis(yaxis=opts.AxisOpts(type_="value",
#                                      axistick_opts=opts.AxisTickOpts(is_show=True),
                                     splitline_opts=opts.SplitLineOpts(is_show=True),))  # 双坐标轴 这个必须添加,添加虚线
    .set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='消费金额'))
)
bar2 = (
    Bar()
    .add_xaxis(list(age_bin_data.index))
    .add_yaxis('下单次数', list(age_bin_data['下单次数']),yaxis_index=1)
    .set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='下单次数'))
)
bar1.overlap(bar2)
bar1.render_notebook()

    <div id="f0f2a988d9144ec1aeef7b7d28826084" style="width:900px; height:500px;"></div>

# 0元用户
user_0 = data[data['price']==0]['user_id'].reset_index(drop=True)
user_0.shape

(30,)

user_0[~user_0.isin(data[data['price']>0]['user_id'])]
# 30个0元用户中，只有一位是没有产生消费的，可能是中奖用户

21   1515915625468531712.00
Name: user_id, dtype: float64

# 销量前10的产品
top = data.groupby('product_id').agg(销售总量=('buy_count','sum')).reset_index().sort_values('销售总量', ascending=False).reset_index(drop=True).head(10)
top

	product_id	销售总量
0	1515966223517846928	2759
1	1515966223509106786	2597
2	1515966223509088532	2550
3	1515966223509088613	2549
4	1515966223509088567	2497
5	1515966223510174551	2370
6	1515966223509088521	2156
7	1515966223509104145	2037
8	1515966223509088639	1904
9	1515966223509117074	1797

bar = Bar()
bar.add_xaxis(top.product_id.tolist())
bar.add_yaxis('销量',top['销售总量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销量前十", subtitle='产品销售量'),
                    xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="7038c9df402a44bfb1327f249699ac13" style="width:900px; height:500px;"></div>

# 销售前十的产品类别
category_top = data[data.category_code != "M"].groupby('category_code').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(10)
category_top

	category_code	销量
88	electronics.smartphone	102169
58	computers.notebook	25860
30	appliances.kitchen.refrigerators	20020
81	electronics.audio.headphone	19739
92	electronics.video.tv	17623
13	appliances.environment.vacuum	15906
33	appliances.kitchen.washer	14163
25	appliances.kitchen.kettle	11869
63	computers.peripherals.mouse	10146
99	furniture.kitchen.table	9659

bar = Bar()
bar.add_xaxis(category_top['category_code'].tolist())
bar.add_yaxis('销量',category_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销量前十", subtitle="类别销量"),
                    xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="1f90c798f25c4c01a0db3a6d918d542a" style="width:900px; height:500px;"></div>

# 产品销售额情况
product_amount_top = data.groupby('product_id').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
product_amount_top

	product_id	销售额
46	1515966223509088567	2138006.31
95	1515966223509088671	1540314.88
183	1515966223509089284	1062128.50
67	1515966223509088628	1007196.96
12	1515966223509088509	921041.65
251	1515966223509089438	850236.39
26	1515966223509088532	767295.00
1035	1515966223509104892	760081.48
1221	1515966223509105893	739893.40
72	1515966223509088639	661068.80

bar = Bar()
bar.add_xaxis(product_amount_top.product_id.tolist())
bar.add_yaxis('销售额',product_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销售额情况", subtitle='销售额前十产品'),
                    xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="f3a5b14750424d7a879434e0c06035f5" style="width:900px; height:500px;"></div>

# 产品类别销售额情况
category_amount_top = data[data.category_code != "M"].groupby('category_code').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
category_amount_top

	category_code	销售额
88	electronics.smartphone	41214019.09
58	computers.notebook	14785855.65
92	electronics.video.tv	8724539.82
30	appliances.kitchen.refrigerators	8598364.87
33	appliances.kitchen.washer	4923616.33
13	appliances.environment.vacuum	2439553.99
89	electronics.tablet	2343150.48
87	electronics.clocks	2180030.73
9	appliances.environment.air_conditioner	1826364.85
23	appliances.kitchen.hood	1799268.89

bar = Bar()
bar.add_xaxis(category_amount_top.category_code.tolist())
bar.add_yaxis('销售额',category_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销售额情况", subtitle='销售额前十产品类别'),
                    xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=75),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="0cd0495fa04746748323be044d107b4c" style="width:900px; height:500px;"></div>

# 手机销量表现最好的前5个品牌
brand_top = data[data.category_code == 'electronics.smartphone'].groupby('brand').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(5)
brand_top

	brand	销量
14	samsung	51376
1	apple	23365
18	xiaomi	8328
7	huawei	7738
12	oppo	6876

bar = Bar()
bar.add_xaxis(brand_top.brand.tolist())
bar.add_yaxis('销量',brand_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="手机品牌销量情况", subtitle='销量前5手机品牌'),
                    xaxis_opts=opts.AxisOpts(name='手机品牌', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="22e71d28a7cc4bc29596388da0bded6d" style="width:900px; height:500px;"></div>

#  三星手机用户的分析， 筛选手机品牌为  sumsung  且产品类型为  electronics.smartphone 
age_bin_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('age_bin').agg(销量=('buy_count','sum')).reset_index().sort_values('销量',ascending=False)
age_bin_sumsung_phone

	age_bin	销量
5	(40-45]	7651
6	(45-50]	7640
3	(30-35]	7593
2	(25-30]	7320
0	(15-20]	7273
1	(20-25]	7125
4	(35-40]	6774

bar = Bar()
bar.add_xaxis(age_bin_sumsung_phone.age_bin.tolist())
bar.add_yaxis('销量',age_bin_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各年龄段情况", subtitle="三星手机用户画像"),
                    xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()

    <div id="d39450a08c40435d9334a37616a63d8e" style="width:900px; height:500px;"></div>

local_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('local').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量',ascending=False)
local_sumsung_phone

	local	销量
4	广东	11491
0	上海	8635
1	北京	8356
2	四川	3231
9	湖南	3120
7	海南	2874
10	重庆	2851
6	浙江	2828
5	江苏	2717
3	天津	2684
8	湖北	2589

bar = Bar()
bar.add_xaxis(local_sumsung_phone.local.tolist())
bar.add_yaxis('销量',local_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各区域销量情况", subtitle="区域用户画像"),
                    xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="591a52be7a6d47e6bd7c33646258990f" style="width:900px; height:500px;"></div>

sex_samsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('sex').agg(销量=('buy_count','sum')).reset_index()
sex_samsung_phone

	sex	销量
0	女	25477
1	男	25899

bar = Bar()
bar.add_xaxis(sex_samsung_phone.sex.tolist())
bar.add_yaxis('销量',sex_samsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各性别销量情况", subtitle="性别用户画像"),
                    xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()

    <div id="54166c92e64b4d61ae7b039fbf56c68c" style="width:900px; height:500px;"></div>

用户生命周期模型

pivoted_amount = data[data['buy_amount']>0].pivot_table(index='user_id', 
                                       columns='month',
                                      values='buy_count',
                                      aggfunc='sum').fillna(0)
pivoted_amount.head()

month	1	2	3	4	5	6	7	8	9	10	11
user_id
1515915625439951872.00	0.00	0.00	0.00	0.00	0.00	0.00	1.00	0.00	0.00	0.00	0.00
1515915625440038400.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	1.00	1.00	0.00
1515915625440051712.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	3.00	10.00
1515915625440099840.00	0.00	0.00	0.00	0.00	2.00	1.00	1.00	0.00	5.00	10.00	2.00
1515915625440121600.00	0.00	0.00	0.00	0.00	1.00	0.00	1.00	0.00	0.00	0.00	0.00

columns_month = pivoted_amount.columns.astype('str')  # 必须把列名转为字符
pivoted_amount.columns = columns_month
pivoted_purchase = pivoted_amount.applymap(lambda x: 1 if x>0 else 0)
pivoted_purchase.head()

month	1	2	3	4	5	6	7	8	9	10	11
user_id
1515915625439951872.00	0	0	0	0	0	0	1	0	0	0	0
1515915625440038400.00	0	0	0	0	0	0	0	0	1	1	0
1515915625440051712.00	0	0	0	0	0	0	0	0	0	1	1
1515915625440099840.00	0	0	0	0	1	1	1	0	1	1	1
1515915625440121600.00	0	0	0	0	1	0	1	0	0	0	0

def active_status(data):
    status =[]
    for i in range(11):
        #若本月没有消费
        if data[i] ==0:
            if len(status)>0:  #如果不是第一个月,
                if status[i-1]=='未注册': #如果上个月已经是未注册,那么本月也是未注册
                    status.append('未注册') 
                else:  #如果上月已注册,则本月为不活跃
                    status.append('不活跃')
            else:  #如果是第一个月
                status.append('未注册') #则未注册
        #若本月消费
        else:
            if len(status)==0: #如果是第一个月,则为新注册用户
                status.append('新客户')
            else:  #如果不是第一个月
                if status[i-1]=='不活跃':  #如果上月为不活跃,那么本月为回流
                    status.append('回流')
                elif status[i-1]=='未注册':  #如果上月为未注册,那么本月为新注册
                    status.append('新客户')
                else:  #如果上月为活跃,本月也为活跃
                    status.append('活跃')
    return pd.Series(status,index=columns_month)

pivoted_purchase_status = pivoted_purchase.apply(lambda x: active_status(x), axis=1)
pivoted_purchase_status.head()

month	1	2	3	4	5	6	7	8	9	10	11
user_id
1515915625439951872.00	未注册	未注册	未注册	未注册	未注册	未注册	新客户	不活跃	不活跃	不活跃	不活跃
1515915625440038400.00	未注册	未注册	未注册	未注册	未注册	未注册	未注册	未注册	新客户	活跃	不活跃
1515915625440051712.00	未注册	未注册	未注册	未注册	未注册	未注册	未注册	未注册	未注册	新客户	活跃
1515915625440099840.00	未注册	未注册	未注册	未注册	新客户	活跃	活跃	不活跃	回流	活跃	活跃
1515915625440121600.00	未注册	未注册	未注册	未注册	新客户	不活跃	回流	不活跃	不活跃	不活跃	不活跃

purchase_count = pivoted_purchase_status.apply(lambda x:x.value_counts())
# purchase_count.head()
# 去除未注册的数据行
purchase_count = purchase_count[purchase_count.index != "未注册"]
purchase_count = purchase_count.fillna(0)
purchase_count

month	1	2	3	4	5	6	7	8	9	10	11
不活跃	0.00	1190.00	2346	3685	5852	18118	24187	40985	64665	75528	84256.00
回流	0.00	0.00	261	432	1377	1251	3462	4244	4199	3740	2444.00
新客户	1813.00	1613.00	1491	4177	13914	8950	22036	22052	8163	5607	2938.00
活跃	0.00	623.00	819	800	1865	3639	4309	8765	7182	4941	3116.00

purchase_count = purchase_count.T
purchase_count

	不活跃	回流	新客户	活跃
month
1	0.00	0.00	1813.00	0.00
2	1190.00	0.00	1613.00	623.00
3	2346.00	261.00	1491.00	819.00
4	3685.00	432.00	4177.00	800.00
5	5852.00	1377.00	13914.00	1865.00
6	18118.00	1251.00	8950.00	3639.00
7	24187.00	3462.00	22036.00	4309.00
8	40985.00	4244.00	22052.00	8765.00
9	64665.00	4199.00	8163.00	7182.00
10	75528.00	3740.00	5607.00	4941.00
11	84256.00	2444.00	2938.00	3116.00

line = Line()
line.add_xaxis(purchase_count.index.tolist())
line.add_yaxis('不活跃',purchase_count['不活跃'].tolist())
line.add_yaxis('活跃',purchase_count['活跃'].tolist())
line.add_yaxis('新客户',purchase_count['新客户'].tolist())
line.add_yaxis('回流',purchase_count['回流'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="用户生命周期分层情况", subtitle="用户分层"),
                    xaxis_opts=opts.AxisOpts(name='月份', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='分层', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()

    <div id="5314f0ff88d54f9b9d2dc2234bab163d" style="width:900px; height:500px;"></div>

帕累托模型

# 查询每个用户的消费金额以及累计消费金额
user_2_8 = data.groupby('user_id').agg(消费金额=('buy_amount', 'sum')).sort_values('消费金额', ascending=False).reset_index()
user_2_8['累计销售额'] = user_2_8['消费金额'].cumsum()
user_2_8.head()

	user_id	消费金额	累计销售额
0	1515915625512422912.00	160604.07	160604.07
1	1515915625513695488.00	158277.37	318881.44
2	1515915625512377088.00	149967.06	468848.50
3	1515915625513577472.00	135672.84	604521.34
4	1515915625514597888.00	133945.88	738467.22

p = user_2_8['消费金额'].cumsum() / user_2_8['消费金额'].sum()
key = p[p>0.8].index[0]
key

key / user_2_8.shape[0]

0.2739259339119185

line = Line()
line.add_xaxis(user_2_8.index.tolist())
line.add_yaxis('累计销售额',user_2_8['累计销售额'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="帕累托模型", subtitle="累计销售额"),
#                     xaxis_opts=opts.AxisOpts(name='', name_location='middle', name_gap=35),  #  设置X轴标签
#                     yaxis_opts=opts.AxisOpts(name='', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()

    <div id="6084df0633cf4a048896a735c59a7adf" style="width:900px; height:500px;"></div>

灯下夜无眠

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫