关注微信公众号:小程在线
整体数据
关注CSDN博客:程志伟的博客
# Preview the first five rows of the order-level dataset.
data.head(n=5)
Out[46]:
order_id product_id ... buy_cnt amount
0 2294359932054536986 1515966223509089906 ... 2 324.02
1 2294444024058086220 2273948319057183658 ... 2 155.04
2 2294584263154074236 2273948316817424439 ... 1 217.57
3 2295716521449619559 1515966223509261697 ... 1 39.33
4 2295740594749702229 1515966223509104892 ... 4 5548.04
[5 rows x 16 columns]
# Total units sold per product, ranked from the best seller downwards and
# renumbered so the index doubles as a zero-based rank.
cnt = (
    data.groupby('product_id')
    .agg(销售总量=('buy_cnt', 'sum'))
    .reset_index()
    .sort_values('销售总量', ascending=False)
    .reset_index(drop=True)
)
# Ten products with the highest unit sales.
cnt.head(10)
Out[47]:
product_id 销售总量
0 1515966223517846928 2759
1 1515966223509106786 2597
2 1515966223509088532 2550
3 1515966223509088613 2549
4 1515966223509088567 2497
5 1515966223510174551 2370
6 1515966223509088521 2156
7 1515966223509104145 2037
8 1515966223509088639 1904
9 1515966223509117074 1797
# Distribution of per-product unit sales, with extra tail percentiles.
# NOTE(review): product_id is numeric, so it is summarised as well; only the
# 销售总量 column of the result carries meaning.
cnt.describe(percentiles=[0.01, 0.1, 0.25, 0.75, 0.9, 0.99])
Out[48]:
product_id 销售总量
count 19053.00 19053.00
mean 1799977560495135232.00 28.12
std 381362937154169920.00 93.48
min 1515966223509088512.00 1.00
1% 1515966223509089280.00 1.00
10% 1515966223509118464.00 1.00
25% 1515966223509298688.00 2.00
50% 1515966223510604544.00 5.00
75% 2273948287155306752.00 19.00
90% 2309018263300237312.00 64.00
99% 2388434452476071936.00 359.00
max 2388434452476881920.00 2759.00
# Products whose total unit sales are below ten.
cnt.loc[cnt['销售总量'] < 10]
Out[49]:
product_id 销售总量
6984 1515966223510210803 9
6985 1515966223510177837 9
6986 1515966223510338401 9
6987 1515966223509117524 9
6988 1515966223509106104 9
... ...
19048 2273948297037087392 1
19049 1515966223510071726 1
19050 2273948297011921539 1
19051 2273948297011921538 1
19052 2388434452476881700 1
[12069 rows x 2 columns]
# Total sales amount per product, ranked from the highest revenue downwards
# and renumbered so the index doubles as a zero-based rank.
amount = (
    data.groupby('product_id')
    .agg(销售总额=('amount', 'sum'))
    .reset_index()
    .sort_values('销售总额', ascending=False)
    .reset_index(drop=True)
)
# Ten products with the highest sales amount.
amount.head(10)
Out[50]:
product_id 销售总额
0 1515966223509088567 2138006.31
1 1515966223509088671 1540314.88
2 1515966223509089284 1062128.50
3 1515966223509088628 1007196.96
4 1515966223509088509 921041.65
5 1515966223509089438 850236.39
6 1515966223509088532 767295.00
7 1515966223509104892 760081.48
8 1515966223509105893 739893.40
9 1515966223509088639 661068.80
# Distribution of per-product sales amount, with extra tail percentiles.
# NOTE(review): product_id is numeric, so it is summarised as well; only the
# 销售总额 column of the result carries meaning.
amount.describe(percentiles=[0.01, 0.1, 0.25, 0.75, 0.9, 0.99])
Out[51]:
product_id 销售总额
count 19053.00 19053.00
mean 1799977560495128832.00 6035.09
std 381362937154172288.00 35954.79
min 1515966223509088512.00 0.00
1% 1515966223509089280.00 0.46
10% 1515966223509118464.00 8.24
25% 1515966223509298688.00 52.80
50% 1515966223510604544.00 323.10
75% 2273948287155306752.00 1909.49
90% 2309018263300237312.00 9436.11
99% 2388434452476071936.00 100748.55
max 2388434452476881920.00 2138006.31
# Units sold per category, best-selling category first.
# NOTE(review): rows whose category_code is 'R' are excluded; presumably 'R'
# is a placeholder for an unknown category -- confirm against the cleaning step.
cnt_category = (
    data.loc[data['category_code'] != 'R']
    .groupby('category_code')
    .agg(销量=('buy_cnt', 'sum'))
    .reset_index()
    .sort_values('销量', ascending=False)
    .reset_index(drop=True)
)
# Ten categories with the highest unit sales.
cnt_category.head(10)
Out[52]:
category_code 销量
0 electronics.smartphone 102169
1 computers.notebook 25860
2 appliances.kitchen.refrigerators 20020
3 electronics.audio.headphone 19739
4 electronics.video.tv 17623
5 appliances.environment.vacuum 15906
6 appliances.kitchen.washer 14163
7 appliances.kitchen.kettle 11869
8 computers.peripherals.mouse 10146
9 furniture.kitchen.table 9659
# Units sold per brand within the smartphone category, best seller first.
# Despite its name, brand_5 holds every smartphone brand; only the display
# below is limited to five rows.
# The index is renumbered inside the chain rather than with a separate
# inplace call, so the frame is built in a single expression.
brand_5 = (
    data[data.category_code == 'electronics.smartphone']
    .groupby('brand')
    .agg(销量=('buy_cnt', 'sum'))
    .reset_index()
    .sort_values('销量', ascending=False)
    .reset_index(drop=True)
)
# Five best-selling smartphone brands.
brand_5.head(5)
Out[53]:
brand 销量
0 samsung 51376
1 apple 23365
2 xiaomi 8328
3 huawei 7738
4 oppo 6876
# Units summed over every smartphone brand (sanity check on the brand table).
brand_5.loc[:, '销量'].sum()
Out[54]: 102169
# Pie chart of unit sales for the five best-selling smartphone brands.
# 'x' and 'labels' are given as column names and resolved through `data`.
# The percentages are shares among these five brands only, because pie()
# normalises by the sum of the values it receives.
plt.pie(
    data=brand_5.head(5),
    x='销量',
    labels='brand',
    autopct='%.1f%%',
    textprops={'fontsize': 12, 'color': 'k'},  # text label properties
    radius=2,
)
plt.show()
# Order rows for Samsung smartphones.
# NOTE(review): data_ is defined outside this excerpt; judging by the extra
# age_bin column it is presumably `data` with an age bracket added -- confirm.
user_samsung = data_.loc[
    data_['brand'].eq('samsung')
    & data_['category_code'].eq('electronics.smartphone')
]
user_samsung.head()
Out[56]:
order_id product_id ... amount age_bin
14 2297770405059888020 1515966223509088578 ... 300.90 (25-30]岁
17 2297896107595793042 1515966223509104759 ... 300.90 (40-45]岁
21 2297988436574864215 1515966223509089486 ... 115.72 (45-50]岁
27 2297993157859606924 1515966223509089076 ... 161.78 (35-40]岁
30 2297995500990759592 1515966223509104759 ... 300.90 (45-50]岁
[5 rows x 17 columns]
# Samsung smartphone units sold per age bracket.
user_samsung.groupby('age_bin').agg(
    销量=('buy_cnt', 'sum'),
)
Out[57]:
销量
age_bin
(15-20]岁 7273
(20-25]岁 7125
(25-30]岁 7320
(30-35]岁 7593
(35-40]岁 6774
(40-45]岁 7651
(45-50]岁 7640
# Samsung smartphone units sold per value of the sex column.
user_samsung.groupby('sex').agg(
    销量=('buy_cnt', 'sum'),
)
Out[58]:
销量
sex
女 25477
男 25899
# Samsung smartphone units sold per region (the `local` column), largest first.
local_brand = (
    user_samsung.groupby('local')
    .agg(销量=('buy_cnt', 'sum'))
    .sort_values('销量', ascending=False)
)
local_brand
Out[59]:
销量
local
广东 11491
上海 8635
北京 8356
四川 3231
湖南 3120
海南 2874
重庆 2851
浙江 2828
江苏 2717
天津 2684
湖北 2589
# Pie chart of Samsung smartphone units by region.
plt.pie(
    x=local_brand['销量'].values,
    labels=local_brand.index,
    autopct='%.1f%%',
    textprops={'fontsize': 12, 'color': 'k'},  # text label properties
    counterclock=False,  # draw the wedges clockwise instead of counter-clockwise
    startangle=90,  # angle at which the first wedge starts
    radius=2,
)
plt.show()
# Order rows for Apple smartphones.
# NOTE(review): data_ is defined outside this excerpt; judging by the extra
# age_bin column it is presumably `data` with an age bracket added -- confirm.
user_apple = data_.loc[
    data_['brand'].eq('apple')
    & data_['category_code'].eq('electronics.smartphone')
]
user_apple.head()
Out[61]:
order_id product_id ... amount age_bin
4 2295740594749702229 1515966223509104892 ... 5548.04 (20-25]岁
35 2298002125248004522 2273948241915544394 ... 346.97 (20-25]岁
121 2298085777554276936 1515966223509123272 ... 242.80 (40-45]岁
129 2298091624959311994 1515966223509089754 ... 208.31 (20-25]岁
192 2298165996101304765 1515966223509089406 ... 1433.54 (30-35]岁
[5 rows x 17 columns]
# Apple smartphone units sold per age bracket.
user_apple.groupby('age_bin').agg(
    销量=('buy_cnt', 'sum'),
)
Out[62]:
销量
age_bin
(15-20]岁 3190
(20-25]岁 3322
(25-30]岁 3429
(30-35]岁 3480
(35-40]岁 3246
(40-45]岁 3316
(45-50]岁 3382
# Apple smartphone units sold per value of the sex column.
user_apple.groupby('sex').agg(
    销量=('buy_cnt', 'sum'),
)
Out[63]:
销量
sex
女 11655
男 11710
# Apple smartphone units sold per region (the `local` column), largest first.
local_brand_apple = (
    user_apple.groupby('local')
    .agg(销量=('buy_cnt', 'sum'))
    .sort_values('销量', ascending=False)
)
local_brand_apple
Out[64]:
销量
local
广东 5265
北京 3831
上海 3830
海南 1408
湖南 1333
四川 1328
天津 1320
江苏 1309
重庆 1265
浙江 1245
湖北 1231
# Pie chart of Apple smartphone units by region.
plt.pie(
    x=local_brand_apple['销量'].values,
    labels=local_brand_apple.index,
    autopct='%.1f%%',
    textprops={'fontsize': 12, 'color': 'k'},  # text label properties
    counterclock=False,  # draw the wedges clockwise instead of counter-clockwise
    startangle=90,  # angle at which the first wedge starts
    radius=2,
)
plt.show()