import pandas as pd
import numpy as np
import warnings
warnings. filterwarnings( 'ignore' )
pd. set_option( 'display.float_format' , lambda x : '%.2f' % x)
data = pd. read_csv( 'data.csv' , index_col= 0 )
data. shape
(564169, 11)
data. columns
Index(['event_time', 'order_id', 'product_id', 'category_id', 'category_code',
'brand', 'price', 'user_id', 'age', 'sex', 'local'],
dtype='object')
data. head( )
event_time order_id product_id category_id category_code brand price user_id age sex local 0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 女 广东
data. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 event_time 564169 non-null object
1 order_id 564169 non-null int64
2 product_id 564169 non-null int64
3 category_id 564169 non-null float64
4 category_code 434799 non-null object
5 brand 536945 non-null object
6 price 564169 non-null float64
7 user_id 564169 non-null float64
8 age 564169 non-null float64
9 sex 564169 non-null object
10 local 564169 non-null object
dtypes: float64(4), int64(2), object(5)
memory usage: 51.7+ MB
数据预处理
data[ 'date' ] = data. event_time. apply ( lambda x: x. split( ' ' ) [ 0 ] )
data. head( )
event_time order_id product_id category_id category_code brand price user_id age sex local date 0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 女 广东 2020-04-24
data[ 'date' ] = pd. to_datetime( data[ 'date' ] )
data. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 event_time 564169 non-null object
1 order_id 564169 non-null int64
2 product_id 564169 non-null int64
3 category_id 564169 non-null float64
4 category_code 434799 non-null object
5 brand 536945 non-null object
6 price 564169 non-null float64
7 user_id 564169 non-null float64
8 age 564169 non-null float64
9 sex 564169 non-null object
10 local 564169 non-null object
11 date 564169 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2), object(5)
memory usage: 56.0+ MB
data[ 'month' ] = data. date. dt. month
data. head( )
event_time order_id product_id category_id category_code brand price user_id age sex local date month 0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 4 1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 4 2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 4 3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 4 4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 女 广东 2020-04-24 4
data[ 'weekday' ] = data. date. apply ( lambda x: x. strftime( "%w" ) )
data. head( )
event_time order_id product_id category_id category_code brand price user_id age sex local date month weekday 0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 4 5 1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 4 5 2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 4 5 3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 4 5 4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 女 广东 2020-04-24 4 5
del data[ 'event_time' ]
data. head( )
order_id product_id category_id category_code brand price user_id age sex local date month weekday 0 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 4 5 1 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 女 海南 2020-04-24 4 5 2 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 4 5 3 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 女 北京 2020-04-24 4 5 4 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 女 广东 2020-04-24 4 5
data. isnull( ) . sum ( )
order_id 0
product_id 0
category_id 0
category_code 129370
brand 27224
price 0
user_id 0
age 0
sex 0
local 0
date 0
month 0
weekday 0
dtype: int64
data[ 'category_code' ] = data[ 'category_code' ] . fillna( 'M' )
data = data[ data. brand. notnull( ) ]
data. isnull( ) . sum ( )
order_id 0
product_id 0
category_id 0
category_code 0
brand 0
price 0
user_id 0
age 0
sex 0
local 0
date 0
month 0
weekday 0
dtype: int64
data. duplicated( ) . sum ( )
634
data = data. value_counts( ) . reset_index( ) . rename( columns= { 0 : "buy_count" } )
data[ 'buy_amount' ] = data[ 'price' ] * data[ 'buy_count' ]
data. head( )
order_id product_id category_id category_code brand price user_id age sex local date month weekday buy_count buy_amount 0 2318945879811162983 2309018204833317816 2268105479144079872.00 M compliment 0.56 1515915625465863936.00 28.00 女 浙江 2020-05-28 5 4 4 2.24 1 2295740594749702229 1515966223509104892 2268105428166508800.00 electronics.smartphone apple 1387.01 1515915625448766464.00 21.00 男 北京 2020-04-26 4 0 4 5548.04 2 2388440981134674698 1515966223509106757 2360741867017995776.00 appliances.environment.air_conditioner samsung 366.41 1515915625514599680.00 50.00 男 广东 2020-11-16 11 1 4 1465.64 3 2375043331555066740 2273948308370096764 2268105409048871168.00 computers.network.router altel 57.85 1515915625504379136.00 19.00 女 上海 2020-08-13 8 4 4 231.40 4 2334999887038383089 1515966223509090031 2268105402673529600.00 M vitek 18.50 1515915625447765248.00 18.00 男 广东 2020-06-19 6 5 3 55.50
data. describe( percentiles= [ 0.01 , 0.25 , 0.75 , 0.99 ] ) . T
count mean std min 1% 25% 50% 75% 99% max order_id 536311.00 2370510904966508032.00 20245175202374012.00 2294359932054536960.00 2305277023666307584.00 2353674406846267392.00 2376454570843832320.00 2388440981134596608.00 2388440981134690304.00 2388440981134693888.00 product_id 536311.00 1692699904736436480.00 327324678972381504.00 1515966223509088512.00 1515966223509088512.00 1515966223509104896.00 1515966223509261824.00 1515966223527326208.00 2388434452475807744.00 2388434452476881920.00 category_id 536311.00 2273068434487365888.00 21891838583158944.00 2268105388421284352.00 2268105388991709952.00 2268105406549066752.00 2268105428166508800.00 2268105439323357952.00 2374498914000592384.00 2374498914001945600.00 price 536311.00 214.54 305.98 0.00 1.13 24.51 99.51 289.33 1387.01 11574.05 user_id 536311.00 1515915625493883648.00 24990827.60 1515915625439951872.00 1515915625441152000.00 1515915625467037184.00 1515915625486696704.00 1515915625511521280.00 1515915625514800128.00 1515915625514891264.00 age 536311.00 33.18 10.12 16.00 16.00 24.00 33.00 42.00 50.00 50.00 month 536311.00 7.72 2.56 1.00 1.00 6.00 8.00 10.00 11.00 11.00 buy_count 536311.00 1.00 0.04 1.00 1.00 1.00 1.00 1.00 1.00 4.00 buy_amount 536311.00 214.73 306.48 0.00 1.13 24.98 99.51 289.33 1387.01 11574.05
data. describe( include= 'all' ) . T
count unique top freq first last mean std min 25% 50% 75% max order_id 536311.00 NaN NaN NaN NaT NaT 2370510904966508032.00 20245175202374012.00 2294359932054536960.00 2353674406846267392.00 2376454570843832320.00 2388440981134596608.00 2388440981134693888.00 product_id 536311.00 NaN NaN NaN NaT NaT 1692699904736436480.00 327324678972381504.00 1515966223509088512.00 1515966223509104896.00 1515966223509261824.00 1515966223527326208.00 2388434452476881920.00 category_id 536311.00 NaN NaN NaN NaT NaT 2273068434487365888.00 21891838583158944.00 2268105388421284352.00 2268105406549066752.00 2268105428166508800.00 2268105439323357952.00 2374498914001945600.00 category_code 536311 124 M 116093 NaT NaT NaN NaN NaN NaN NaN NaN NaN brand 536311 868 samsung 96123 NaT NaT NaN NaN NaN NaN NaN NaN NaN price 536311.00 NaN NaN NaN NaT NaT 214.54 305.98 0.00 24.51 99.51 289.33 11574.05 user_id 536311.00 NaN NaN NaN NaT NaT 1515915625493883648.00 24990827.60 1515915625439951872.00 1515915625467037184.00 1515915625486696704.00 1515915625511521280.00 1515915625514891264.00 age 536311.00 NaN NaN NaN NaT NaT 33.18 10.12 16.00 24.00 33.00 42.00 50.00 sex 536311 2 男 270454 NaT NaT NaN NaN NaN NaN NaN NaN NaN local 536311 11 广东 117097 NaT NaT NaN NaN NaN NaN NaN NaN NaN date 536311 323 2020-10-22 00:00:00 8310 1970-01-01 2020-11-21 NaN NaN NaN NaN NaN NaN NaN month 536311.00 NaN NaN NaN NaT NaT 7.72 2.56 1.00 6.00 8.00 10.00 11.00 weekday 536311 7 6 86379 NaT NaT NaN NaN NaN NaN NaN NaN NaN buy_count 536311.00 NaN NaN NaN NaT NaT 1.00 0.04 1.00 1.00 1.00 1.00 4.00 buy_amount 536311.00 NaN NaN NaN NaT NaT 214.73 306.48 0.00 24.98 99.51 289.33 11574.05
data = data[ data. date> '1970-01-01' ]
data. date. min ( )
Timestamp('2020-01-05 00:00:00')
data. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 535065 entries, 0 to 536310
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 535065 non-null int64
1 product_id 535065 non-null int64
2 category_id 535065 non-null float64
3 category_code 535065 non-null object
4 brand 535065 non-null object
5 price 535065 non-null float64
6 user_id 535065 non-null float64
7 age 535065 non-null float64
8 sex 535065 non-null object
9 local 535065 non-null object
10 date 535065 non-null datetime64[ns]
11 month 535065 non-null int64
12 weekday 535065 non-null object
13 buy_count 535065 non-null int64
14 buy_amount 535065 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(4), object(5)
memory usage: 65.3+ MB
data. shape
(535065, 15)
数据分析
基础维度分析:地域、年龄、性别、0元用户 产品分析:销量前10、销售额前10、销量前5人群特征 用户分析:生命周期模型、帕累托模型、RFM模型
data_local = data. groupby( 'local' ) [ 'user_id' ] . nunique( ) . reset_index( ) . sort_values( 'user_id' , ascending= False )
data_local. rename( columns= { 'user_id' : '用户数' } , inplace= True )
data_local
local 用户数 4 广东 21382 0 上海 16031 1 北京 15928 5 江苏 5561 7 海南 5449 2 四川 5445 6 浙江 5370 8 湖北 5355 10 重庆 5342 3 天津 5337 9 湖南 5330
from pyecharts import options as opts
from pyecharts. charts import Bar, Line
bar = Bar( )
bar. add_xaxis( data_local. local. tolist( ) )
bar. add_yaxis( '用户数' , data_local[ '用户数' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "各地区域用户数量情况" , subtitle= "区域用户数" ) ,
xaxis_opts= opts. AxisOpts( name= '区域' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '用户数' , name_location= 'middle' , name_gap= 70 ) ,
toolbox_opts= opts. ToolboxOpts( ) )
bar. render_notebook( )
<div id="dae8b16860cc447b9d61f32b0227b510" style="width:900px; height:500px;"></div>
sex = data. groupby( 'sex' ) [ 'user_id' ] . nunique( ) . reset_index( )
sex. rename( columns= { 'user_id' : '用户数量' } , inplace= True )
sex
bar = Bar( )
bar. add_xaxis( sex. sex. tolist( ) )
bar. add_yaxis( '用户' , sex[ '用户数量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "性别分类用户数情况" , subtitle= "用户数" ) ,
xaxis_opts= opts. AxisOpts( name= '性别' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '用户数' , name_location= 'middle' , name_gap= 70 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="da7684cee9fd43ff83621c244ab2c8f6" style="width:900px; height:500px;"></div>
age = data. groupby( 'age' ) [ 'user_id' ] . nunique( ) . reset_index( )
age. rename( columns= { 'user_id' : '用户数量' } , inplace= True )
age = age. sort_values( by= '用户数量' , ascending= False )
age
age 用户数量 27 43.00 2875 28 44.00 2854 16 32.00 2841 4 20.00 2838 12 28.00 2825 7 23.00 2820 22 38.00 2820 23 39.00 2813 20 36.00 2805 13 29.00 2802 0 16.00 2797 10 26.00 2793 6 22.00 2792 5 21.00 2786 29 45.00 2784 32 48.00 2781 15 31.00 2772 9 25.00 2768 8 24.00 2764 19 35.00 2764 26 42.00 2763 2 18.00 2759 14 30.00 2755 25 41.00 2753 17 33.00 2745 18 34.00 2738 1 17.00 2725 11 27.00 2724 34 50.00 2724 30 46.00 2713 21 37.00 2710 33 49.00 2707 3 19.00 2703 24 40.00 2685 31 47.00 2677
data_age = data. copy( )
bins = [ 15 , 20 , 25 , 30 , 35 , 40 , 45 , 50 ]
labels = [ '(15-20]' , '(20-25]' , '(25-30]' , '(30-35]' , '(35-40]' , '(40-45]' , '(45-50]' ]
data_age[ 'age_bin' ] = pd. cut( x= data. age, bins= bins, labels= labels)
age = data_age. groupby( 'age_bin' ) [ 'user_id' ] . nunique( ) . reset_index( )
age. rename( columns= { 'user_id' : '用户数量' } , inplace= True )
age = age. sort_values( by= '用户数量' , ascending= False )
age
age_bin 用户数量 5 (40-45] 13969 1 (20-25] 13867 2 (25-30] 13831 3 (30-35] 13802 4 (35-40] 13775 0 (15-20] 13726 6 (45-50] 13535
bar = Bar( )
bar. add_xaxis( age. age_bin. tolist( ) )
bar. add_yaxis( '用户' , age[ '用户数量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "各年龄用户数情况" , subtitle= "用户数" ) ,
xaxis_opts= opts. AxisOpts( name= '年龄段' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '用户数' , name_location= 'middle' , name_gap= 70 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="4cf24a3b14b24b4e902b467ca391133d" style="width:900px; height:500px;"></div>
age_bin_data = data_age. groupby( 'age_bin' ) . agg( 消费金额= ( 'buy_amount' , 'sum' ) , 下单次数= ( 'order_id' , 'nunique' ) )
age_bin_data
消费金额 下单次数 age_bin (15-20] 16007287.18 54302 (20-25] 16500743.70 55546 (25-30] 16513446.00 54723 (30-35] 17004578.70 58275 (35-40] 14995577.44 49744 (40-45] 17078724.54 59067 (45-50] 16886278.32 57085
bar1 = (
Bar( )
. add_xaxis( list ( age_bin_data. index) )
. add_yaxis( '消费金额' , list ( age_bin_data[ '消费金额' ] ) , yaxis_index= 0 )
. extend_axis( yaxis= opts. AxisOpts( type_= "value" ,
splitline_opts= opts. SplitLineOpts( is_show= True ) , ) )
. set_global_opts( title_opts= opts. TitleOpts( title= '各年龄段消费金额及下单次数' , subtitle= '消费金额' ) )
)
bar2 = (
Bar( )
. add_xaxis( list ( age_bin_data. index) )
. add_yaxis( '下单次数' , list ( age_bin_data[ '下单次数' ] ) , yaxis_index= 1 )
. set_global_opts( title_opts= opts. TitleOpts( title= '各年龄段消费金额及下单次数' , subtitle= '下单次数' ) )
)
bar1. overlap( bar2)
bar1. render_notebook( )
<div id="f0f2a988d9144ec1aeef7b7d28826084" style="width:900px; height:500px;"></div>
user_0 = data[ data[ 'price' ] == 0 ] [ 'user_id' ] . reset_index( drop= True )
user_0. shape
(30,)
user_0[ ~ user_0. isin( data[ data[ 'price' ] > 0 ] [ 'user_id' ] ) ]
21 1515915625468531712.00
Name: user_id, dtype: float64
top = data. groupby( 'product_id' ) . agg( 销售总量= ( 'buy_count' , 'sum' ) ) . reset_index( ) . sort_values( '销售总量' , ascending= False ) . reset_index( drop= True ) . head( 10 )
top
product_id 销售总量 0 1515966223517846928 2759 1 1515966223509106786 2597 2 1515966223509088532 2550 3 1515966223509088613 2549 4 1515966223509088567 2497 5 1515966223510174551 2370 6 1515966223509088521 2156 7 1515966223509104145 2037 8 1515966223509088639 1904 9 1515966223509117074 1797
bar = Bar( )
bar. add_xaxis( top. product_id. tolist( ) )
bar. add_yaxis( '销量' , top[ '销售总量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "产品销量前十" , subtitle= '产品销售量' ) ,
xaxis_opts= opts. AxisOpts( name= '产品' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销量' , name_location= 'middle' , name_gap= 70 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="7038c9df402a44bfb1327f249699ac13" style="width:900px; height:500px;"></div>
category_top = data[ data. category_code != "M" ] . groupby( 'category_code' ) . agg( 销量= ( 'buy_count' , 'sum' ) ) . reset_index( ) . sort_values( '销量' , ascending= False ) . head( 10 )
category_top
category_code 销量 88 electronics.smartphone 102169 58 computers.notebook 25860 30 appliances.kitchen.refrigerators 20020 81 electronics.audio.headphone 19739 92 electronics.video.tv 17623 13 appliances.environment.vacuum 15906 33 appliances.kitchen.washer 14163 25 appliances.kitchen.kettle 11869 63 computers.peripherals.mouse 10146 99 furniture.kitchen.table 9659
bar = Bar( )
bar. add_xaxis( category_top[ 'category_code' ] . tolist( ) )
bar. add_yaxis( '销量' , category_top[ '销量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "产品类别销量前十" , subtitle= "类别销量" ) ,
xaxis_opts= opts. AxisOpts( name= '产品类别' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销量' , name_location= 'middle' , name_gap= 70 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="1f90c798f25c4c01a0db3a6d918d542a" style="width:900px; height:500px;"></div>
product_amount_top = data. groupby( 'product_id' ) . agg( 销售额= ( 'buy_amount' , 'sum' ) ) . reset_index( ) . sort_values( '销售额' , ascending= False ) . head( 10 )
product_amount_top
product_id 销售额 46 1515966223509088567 2138006.31 95 1515966223509088671 1540314.88 183 1515966223509089284 1062128.50 67 1515966223509088628 1007196.96 12 1515966223509088509 921041.65 251 1515966223509089438 850236.39 26 1515966223509088532 767295.00 1035 1515966223509104892 760081.48 1221 1515966223509105893 739893.40 72 1515966223509088639 661068.80
bar = Bar( )
bar. add_xaxis( product_amount_top. product_id. tolist( ) )
bar. add_yaxis( '销售额' , product_amount_top[ '销售额' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "产品销售额情况" , subtitle= '销售额前十产品' ) ,
xaxis_opts= opts. AxisOpts( name= '产品' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销售额' , name_location= 'middle' , name_gap= 70 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="f3a5b14750424d7a879434e0c06035f5" style="width:900px; height:500px;"></div>
category_amount_top = data[ data. category_code != "M" ] . groupby( 'category_code' ) . agg( 销售额= ( 'buy_amount' , 'sum' ) ) . reset_index( ) . sort_values( '销售额' , ascending= False ) . head( 10 )
category_amount_top
category_code 销售额 88 electronics.smartphone 41214019.09 58 computers.notebook 14785855.65 92 electronics.video.tv 8724539.82 30 appliances.kitchen.refrigerators 8598364.87 33 appliances.kitchen.washer 4923616.33 13 appliances.environment.vacuum 2439553.99 89 electronics.tablet 2343150.48 87 electronics.clocks 2180030.73 9 appliances.environment.air_conditioner 1826364.85 23 appliances.kitchen.hood 1799268.89
bar = Bar( )
bar. add_xaxis( category_amount_top. category_code. tolist( ) )
bar. add_yaxis( '销售额' , category_amount_top[ '销售额' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "产品类别销售额情况" , subtitle= '销售额前十产品类别' ) ,
xaxis_opts= opts. AxisOpts( name= '产品类别' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销售额' , name_location= 'middle' , name_gap= 75 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="0cd0495fa04746748323be044d107b4c" style="width:900px; height:500px;"></div>
brand_top = data[ data. category_code == 'electronics.smartphone' ] . groupby( 'brand' ) . agg( 销量= ( 'buy_count' , 'sum' ) ) . reset_index( ) . sort_values( '销量' , ascending= False ) . head( 5 )
brand_top
brand 销量 14 samsung 51376 1 apple 23365 18 xiaomi 8328 7 huawei 7738 12 oppo 6876
bar = Bar( )
bar. add_xaxis( brand_top. brand. tolist( ) )
bar. add_yaxis( '销量' , brand_top[ '销量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "手机品牌销量情况" , subtitle= '销量前5手机品牌' ) ,
xaxis_opts= opts. AxisOpts( name= '手机品牌' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销量' , name_location= 'middle' , name_gap= 60 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="22e71d28a7cc4bc29596388da0bded6d" style="width:900px; height:500px;"></div>
age_bin_sumsung_phone = data_age[ ( data_age. brand == 'samsung' ) & ( data_age. category_code == 'electronics.smartphone' ) ] . groupby( 'age_bin' ) . agg( 销量= ( 'buy_count' , 'sum' ) ) . reset_index( ) . sort_values( '销量' , ascending= False )
age_bin_sumsung_phone
age_bin 销量 5 (40-45] 7651 6 (45-50] 7640 3 (30-35] 7593 2 (25-30] 7320 0 (15-20] 7273 1 (20-25] 7125 4 (35-40] 6774
bar = Bar( )
bar. add_xaxis( age_bin_sumsung_phone. age_bin. tolist( ) )
bar. add_yaxis( '销量' , age_bin_sumsung_phone[ '销量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "三星手机用户各年龄段情况" , subtitle= "三星手机用户画像" ) ,
xaxis_opts= opts. AxisOpts( name= '年龄段' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销量' , name_location= 'middle' , name_gap= 60 ) ,
toolbox_opts= opts. ToolboxOpts( ) )
bar. render_notebook( )
<div id="d39450a08c40435d9334a37616a63d8e" style="width:900px; height:500px;"></div>
local_sumsung_phone = data_age[ ( data_age. brand == 'samsung' ) & ( data_age. category_code == 'electronics.smartphone' ) ] . groupby( 'local' ) . agg( 销量= ( 'buy_count' , 'sum' ) ) . reset_index( ) . sort_values( '销量' , ascending= False )
local_sumsung_phone
local 销量 4 广东 11491 0 上海 8635 1 北京 8356 2 四川 3231 9 湖南 3120 7 海南 2874 10 重庆 2851 6 浙江 2828 5 江苏 2717 3 天津 2684 8 湖北 2589
bar = Bar( )
bar. add_xaxis( local_sumsung_phone. local. tolist( ) )
bar. add_yaxis( '销量' , local_sumsung_phone[ '销量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "三星手机用户各区域销量情况" , subtitle= "区域用户画像" ) ,
xaxis_opts= opts. AxisOpts( name= '区域' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销量' , name_location= 'middle' , name_gap= 60 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="591a52be7a6d47e6bd7c33646258990f" style="width:900px; height:500px;"></div>
sex_samsung_phone = data_age[ ( data_age. brand == 'samsung' ) & ( data_age. category_code == 'electronics.smartphone' ) ] . groupby( 'sex' ) . agg( 销量= ( 'buy_count' , 'sum' ) ) . reset_index( )
sex_samsung_phone
bar = Bar( )
bar. add_xaxis( sex_samsung_phone. sex. tolist( ) )
bar. add_yaxis( '销量' , sex_samsung_phone[ '销量' ] . tolist( ) )
bar. set_global_opts( title_opts= opts. TitleOpts( title= "三星手机用户各性别销量情况" , subtitle= "性别用户画像" ) ,
xaxis_opts= opts. AxisOpts( name= '性别' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '销量' , name_location= 'middle' , name_gap= 60 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
bar. render_notebook( )
<div id="54166c92e64b4d61ae7b039fbf56c68c" style="width:900px; height:500px;"></div>
用户生命周期模型
pivoted_amount = data[ data[ 'buy_amount' ] > 0 ] . pivot_table( index= 'user_id' ,
columns= 'month' ,
values= 'buy_count' ,
aggfunc= 'sum' ) . fillna( 0 )
pivoted_amount. head( )
month 1 2 3 4 5 6 7 8 9 10 11 user_id 1515915625439951872.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00 1515915625440038400.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 1515915625440051712.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 3.00 10.00 1515915625440099840.00 0.00 0.00 0.00 0.00 2.00 1.00 1.00 0.00 5.00 10.00 2.00 1515915625440121600.00 0.00 0.00 0.00 0.00 1.00 0.00 1.00 0.00 0.00 0.00 0.00
columns_month = pivoted_amount. columns. astype( 'str' )
pivoted_amount. columns = columns_month
pivoted_purchase = pivoted_amount. applymap( lambda x: 1 if x> 0 else 0 )
pivoted_purchase. head( )
month 1 2 3 4 5 6 7 8 9 10 11 user_id 1515915625439951872.00 0 0 0 0 0 0 1 0 0 0 0 1515915625440038400.00 0 0 0 0 0 0 0 0 1 1 0 1515915625440051712.00 0 0 0 0 0 0 0 0 0 1 1 1515915625440099840.00 0 0 0 0 1 1 1 0 1 1 1 1515915625440121600.00 0 0 0 0 1 0 1 0 0 0 0
def active_status ( data) :
status = [ ]
for i in range ( 11 ) :
if data[ i] == 0 :
if len ( status) > 0 :
if status[ i- 1 ] == '未注册' :
status. append( '未注册' )
else :
status. append( '不活跃' )
else :
status. append( '未注册' )
else :
if len ( status) == 0 :
status. append( '新客户' )
else :
if status[ i- 1 ] == '不活跃' :
status. append( '回流' )
elif status[ i- 1 ] == '未注册' :
status. append( '新客户' )
else :
status. append( '活跃' )
return pd. Series( status, index= columns_month)
pivoted_purchase_status = pivoted_purchase. apply ( lambda x: active_status( x) , axis= 1 )
pivoted_purchase_status. head( )
month 1 2 3 4 5 6 7 8 9 10 11 user_id 1515915625439951872.00 未注册 未注册 未注册 未注册 未注册 未注册 新客户 不活跃 不活跃 不活跃 不活跃 1515915625440038400.00 未注册 未注册 未注册 未注册 未注册 未注册 未注册 未注册 新客户 活跃 不活跃 1515915625440051712.00 未注册 未注册 未注册 未注册 未注册 未注册 未注册 未注册 未注册 新客户 活跃 1515915625440099840.00 未注册 未注册 未注册 未注册 新客户 活跃 活跃 不活跃 回流 活跃 活跃 1515915625440121600.00 未注册 未注册 未注册 未注册 新客户 不活跃 回流 不活跃 不活跃 不活跃 不活跃
purchase_count = pivoted_purchase_status. apply ( lambda x: x. value_counts( ) )
purchase_count = purchase_count[ purchase_count. index != "未注册" ]
purchase_count = purchase_count. fillna( 0 )
purchase_count
month 1 2 3 4 5 6 7 8 9 10 11 不活跃 0.00 1190.00 2346 3685 5852 18118 24187 40985 64665 75528 84256.00 回流 0.00 0.00 261 432 1377 1251 3462 4244 4199 3740 2444.00 新客户 1813.00 1613.00 1491 4177 13914 8950 22036 22052 8163 5607 2938.00 活跃 0.00 623.00 819 800 1865 3639 4309 8765 7182 4941 3116.00
purchase_count = purchase_count. T
purchase_count
不活跃 回流 新客户 活跃 month 1 0.00 0.00 1813.00 0.00 2 1190.00 0.00 1613.00 623.00 3 2346.00 261.00 1491.00 819.00 4 3685.00 432.00 4177.00 800.00 5 5852.00 1377.00 13914.00 1865.00 6 18118.00 1251.00 8950.00 3639.00 7 24187.00 3462.00 22036.00 4309.00 8 40985.00 4244.00 22052.00 8765.00 9 64665.00 4199.00 8163.00 7182.00 10 75528.00 3740.00 5607.00 4941.00 11 84256.00 2444.00 2938.00 3116.00
line = Line( )
line. add_xaxis( purchase_count. index. tolist( ) )
line. add_yaxis( '不活跃' , purchase_count[ '不活跃' ] . tolist( ) )
line. add_yaxis( '活跃' , purchase_count[ '活跃' ] . tolist( ) )
line. add_yaxis( '新客户' , purchase_count[ '新客户' ] . tolist( ) )
line. add_yaxis( '回流' , purchase_count[ '回流' ] . tolist( ) )
line. set_global_opts( title_opts= opts. TitleOpts( title= "用户生命周期分层情况" , subtitle= "用户分层" ) ,
xaxis_opts= opts. AxisOpts( name= '月份' , name_location= 'middle' , name_gap= 35 ) ,
yaxis_opts= opts. AxisOpts( name= '分层' , name_location= 'middle' , name_gap= 60 ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
line. render_notebook( )
<div id="5314f0ff88d54f9b9d2dc2234bab163d" style="width:900px; height:500px;"></div>
帕累托模型
user_2_8 = data. groupby( 'user_id' ) . agg( 消费金额= ( 'buy_amount' , 'sum' ) ) . sort_values( '消费金额' , ascending= False ) . reset_index( )
user_2_8[ '累计销售额' ] = user_2_8[ '消费金额' ] . cumsum( )
user_2_8. head( )
user_id 消费金额 累计销售额 0 1515915625512422912.00 160604.07 160604.07 1 1515915625513695488.00 158277.37 318881.44 2 1515915625512377088.00 149967.06 468848.50 3 1515915625513577472.00 135672.84 604521.34 4 1515915625514597888.00 133945.88 738467.22
p = user_2_8[ '消费金额' ] . cumsum( ) / user_2_8[ '消费金额' ] . sum ( )
key = p[ p> 0.8 ] . index[ 0 ]
key
25408
key / user_2_8. shape[ 0 ]
0.2739259339119185
line = Line( )
line. add_xaxis( user_2_8. index. tolist( ) )
line. add_yaxis( '累计销售额' , user_2_8[ '累计销售额' ] . tolist( ) )
line. set_global_opts( title_opts= opts. TitleOpts( title= "帕累托模型" , subtitle= "累计销售额" ) ,
toolbox_opts= opts. ToolboxOpts( item_gap= 5 ) )
line. render_notebook( )
<div id="6084df0633cf4a048896a735c59a7adf" style="width:900px; height:500px;"></div>