import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['simhei']
from datetime import datetime
df=pd.read_csv('kelu.csv')
数据初识
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8757 entries, 0 to 8756
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 author 8757 non-null object
1 rating 8757 non-null int64
2 time 8757 non-null object
3 year 8757 non-null int64
4 amount 8757 non-null int64
5 frequency 8757 non-null int64
dtypes: int64(4), object(2)
memory usage: 410.6+ KB
df.head()
| author | rating | time | year | amount | frequency |
---|
0 | YOUNG | 60 | 2019/2/28 | 2019 | 110 | 1 |
---|
1 | SHING YAN | 100 | 2019/2/28 | 2019 | 110 | 1 |
---|
2 | Sha | 60 | 2019/2/28 | 2019 | 110 | 1 |
---|
3 | Mary Mae | 100 | 2019/2/28 | 2019 | 110 | 1 |
---|
4 | Maria Cristina | 100 | 2019/2/28 | 2019 | 110 | 1 |
---|
df.describe()
| rating | year | amount | frequency |
---|
count | 8757.000000 | 8757.000000 | 8757.0 | 8757.0 |
---|
mean | 92.417495 | 2017.760420 | 110.0 | 1.0 |
---|
std | 14.231179 | 0.686734 | 0.0 | 0.0 |
---|
min | 20.000000 | 2016.000000 | 110.0 | 1.0 |
---|
25% | 80.000000 | 2017.000000 | 110.0 | 1.0 |
---|
50% | 100.000000 | 2018.000000 | 110.0 | 1.0 |
---|
75% | 100.000000 | 2018.000000 | 110.0 | 1.0 |
---|
max | 100.000000 | 2019.000000 | 110.0 | 1.0 |
---|
每日销量分析
df['time'] = pd.to_datetime(df['time'],format='%Y/%m/%d')
df.groupby('time')['rating'].count().plot(figsize=(12,4))
plt.xlabel('日期')
plt.ylabel('销量')
plt.title('16-19年每日销量分析')
Text(0.5, 1.0, '16-19年每日销量分析')
![png](https://i-blog.csdnimg.cn/blog_migrate/4b74708e217015bd1e2f10c9ee3471ee.png#pic_center)
每月销量分析
df['month']= df['time'].values.astype('datetime64[M]')
df.head()
| author | rating | time | year | amount | frequency | month |
---|
0 | YOUNG | 60 | 2019-02-28 | 2019 | 110 | 1 | 2019-02-01 |
---|
1 | SHING YAN | 100 | 2019-02-28 | 2019 | 110 | 1 | 2019-02-01 |
---|
2 | Sha | 60 | 2019-02-28 | 2019 | 110 | 1 | 2019-02-01 |
---|
3 | Mary Mae | 100 | 2019-02-28 | 2019 | 110 | 1 | 2019-02-01 |
---|
4 | Maria Cristina | 100 | 2019-02-28 | 2019 | 110 | 1 | 2019-02-01 |
---|
df.groupby('month')['rating'].count().plot(figsize=(12,4))
plt.xlabel('月份')
plt.ylabel('销售')
plt.title('16~19年每月销量分析')
Text(0.5, 1.0, '16~19年每月销量分析')
![png](https://i-blog.csdnimg.cn/blog_migrate/813354027a1c63f828390165ccb2c821.png#pic_center)
用户购买门票数量分析
df.groupby('author')['frequency'].sum().plot.hist(bins=40)
plt.xlim(1,10)
plt.xlabel('用户购买门票数')
plt.ylabel('用户数')
plt.title('用户三年来购买门票数直方图')
Text(0.5, 1.0, '用户三年来购买门票数直方图')
![png](https://i-blog.csdnimg.cn/blog_migrate/949ec24fd14f9a2dbde2fc8fa928fc86.png#pic_center)
多次购买用户情况分析
df_f2 = df.groupby('author')['frequency'].sum().reset_index()
print(df_f2.head())
print("max:")
print(df_f2['frequency'].max())
df_f2[df_f2['frequency']>1].groupby('author')['frequency'].sum().plot.hist(bins = 40)
plt.xlabel('用户购买门票数')
plt.ylabel('用户数')
plt.title('多次购买用户情况分析直方图')
author frequency
0 wenbiao 1
1 Goh Yu Wen Eunice 1
2 Hui Shan 1
3 Huihui 1
4 KO-CHENG 1
max:
18
Text(0.5, 1.0, '多次购买用户情况分析直方图')
![png](https://i-blog.csdnimg.cn/blog_migrate/abd76c288415934649a7136dcea6c113.png#pic_center)
多次购买用户的具体人数
df_f2[df_f2['frequency']>1].groupby('frequency')['author'].count()
frequency
2 402
3 99
4 49
5 25
6 13
7 7
8 4
11 1
13 1
15 1
18 1
Name: author, dtype: int64
1-5次购买用户占比分析
df_f5=df.groupby('author')['frequency'].sum().reset_index()
df_f5.head()
values = list(df_f5[df_f5['frequency']<=6].groupby('frequency')['frequency'].sum())
print(values)
plt.pie(values,labels=['购买一次','购买两次','购买三次','购买四次','购买五次','购买六次'],autopct='%1.1f%%')
plt.title('1-5次购买次数用户占比分析饼图')
plt.legend()
[7119, 804, 297, 196, 125, 78]
<matplotlib.legend.Legend at 0x25f41b69ea0>
![png](https://i-blog.csdnimg.cn/blog_migrate/b277b58bf0260eb2cbe0ad7a6717497c.png#pic_center)
2-6次购买次数用户占比分析
df_f51=df_f5[df_f5['frequency']>1].reset_index()
df_f51.head()
values = list(df_f51[df_f51['frequency']<=6].groupby('frequency')['frequency'].sum())
print(values)
plt.pie(values,labels=['购买两次','购买三次','购买四次','购买五次','购买六次'],autopct='%1.1f%%')
plt.title('2-6次购买次数用户占比分析饼图')
plt.legend()
[804, 297, 196, 125, 78]
<matplotlib.legend.Legend at 0x25f41c1d6f0>
![png](https://i-blog.csdnimg.cn/blog_migrate/827837a70b61e3b24594a431f6145512.png#pic_center)
复购率分析
df.head()
pivot_count = df.pivot_table(index='author',
columns='month',
values='frequency',
aggfunc='count'
).fillna(0)
pivot_count = pivot_count.applymap(lambda x: 1 if x>1 else np.nan if x==0 else 0)
(pivot_count.sum()/pivot_count.count()).plot()
plt.xlabel('时间(月)')
plt.ylabel('百分比(%)')
plt.title('16-19年每月用户复购率')
Text(0.5, 1.0, '16-19年每月用户复购率')
![png](https://i-blog.csdnimg.cn/blog_migrate/dee70a5a7d7e9419b4f106d241185369.png#pic_center)
复购用户人数分析
pivot_count.sum().plot()
plt.xlabel('时间/月')
plt.ylabel('复购人数')
plt.title('16-19年复购人数')
Text(0.5, 1.0, '16-19年复购人数')
![png](https://i-blog.csdnimg.cn/blog_migrate/b26bfbbec6802e41d9772215d0bacd81.png#pic_center)
回购率分析
pivot_purchase = df.pivot_table(index='author',
columns='month',
values='frequency',
aggfunc='count').fillna(0)
print(pivot_purchase.head())
len(pivot_purchase.columns)
month 2016-08-01 2016-09-01 2016-10-01 2016-11-01 \
author
wenbiao 0.0 0.0 0.0 0.0
Goh Yu Wen Eunice 0.0 0.0 0.0 0.0
Hui Shan 0.0 0.0 0.0 0.0
Huihui 0.0 0.0 0.0 0.0
KO-CHENG 0.0 0.0 0.0 0.0
month 2016-12-01 2017-01-01 2017-02-01 2017-03-01 \
author
wenbiao 0.0 0.0 0.0 0.0
Goh Yu Wen Eunice 0.0 0.0 0.0 0.0
Hui Shan 1.0 0.0 0.0 0.0
Huihui 0.0 0.0 0.0 0.0
KO-CHENG 0.0 0.0 0.0 0.0
month 2017-04-01 2017-05-01 ... 2018-05-01 2018-06-01 \
author ...
wenbiao 0.0 0.0 ... 0.0 0.0
Goh Yu Wen Eunice 0.0 0.0 ... 0.0 0.0
Hui Shan 0.0 0.0 ... 0.0 0.0
Huihui 0.0 0.0 ... 0.0 0.0
KO-CHENG 0.0 0.0 ... 0.0 0.0
month 2018-07-01 2018-08-01 2018-09-01 2018-10-01 \
author
wenbiao 0.0 0.0 0.0 0.0
Goh Yu Wen Eunice 0.0 0.0 0.0 0.0
Hui Shan 0.0 0.0 0.0 0.0
Huihui 0.0 0.0 0.0 0.0
KO-CHENG 1.0 0.0 0.0 0.0
month 2018-11-01 2018-12-01 2019-01-01 2019-02-01
author
wenbiao 0.0 1.0 0.0 0.0
Goh Yu Wen Eunice 0.0 0.0 0.0 0.0
Hui Shan 0.0 0.0 0.0 0.0
Huihui 0.0 0.0 0.0 0.0
KO-CHENG 0.0 0.0 0.0 0.0
[5 rows x 31 columns]
31
def purchase_return(data):
status = []
for i in range(30):
if data[i]==1:
if data[i+1]==1:
status.append(1)
else:
status.append(0)
else:
status.append(np.nan)
status.append(np.nan)
return pd.Series(status,pivot_purchase.columns)
pivot_purchase_return=pivot_purchase.apply(purchase_return,axis=1)
(pivot_purchase_return.sum()/pivot_purchase_return.count()).plot()
plt.title('16年~19年每月的回购率')
plt.xlabel('月份')
plt.ylabel('回购率%')
Text(0, 0.5, '回购率%')
![png](https://i-blog.csdnimg.cn/blog_migrate/9c56bf16e11856b79c8046c584521a7c.png#pic_center)
回购人数分析
pivot_purchase_return.sum().plot()
plt.title('16年~19年每月的回购人数')
plt.xlabel('月份')
plt.ylabel('回购人数')
print(pivot_purchase_return.sum())
month
2016-08-01 0.0
2016-09-01 0.0
2016-10-01 0.0
2016-11-01 0.0
2016-12-01 2.0
2017-01-01 1.0
2017-02-01 1.0
2017-03-01 2.0
2017-04-01 2.0
2017-05-01 2.0
2017-06-01 0.0
2017-07-01 3.0
2017-08-01 2.0
2017-09-01 3.0
2017-10-01 3.0
2017-11-01 6.0
2017-12-01 4.0
2018-01-01 2.0
2018-02-01 4.0
2018-03-01 3.0
2018-04-01 2.0
2018-05-01 12.0
2018-06-01 14.0
2018-07-01 11.0
2018-08-01 6.0
2018-09-01 10.0
2018-10-01 12.0
2018-11-01 17.0
2018-12-01 8.0
2019-01-01 8.0
2019-02-01 0.0
dtype: float64
![png](https://i-blog.csdnimg.cn/blog_migrate/72909dfd54afcb6bd29e2790f7081b82.png#pic_center)
每月分层用户占比
def active_status(data):
status = []
for i in range(31):
if data[i]==0:
if len(status)==0:
status.append('unreg')
else:
if status[i-1]=='unreg':
status.append('unreg')
else:
status.append('unactive')
else:
if len(status)==0:
status.append('new')
else:
if status[i-1]=='unactive':
status.append('return')
elif status[i-1]=='unreg':
status.append('new')
else:
status.append('active')
return pd.Series(status,pivot_purchase.columns)
pivot_purchase_status =pivot_purchase.apply(active_status,axis=1)
pivot_status_count =pivot_purchase_status.replace('unreg',np.NaN).apply(pd.value_counts)
pivot_status_count.T.plot.area()
<AxesSubplot: xlabel='month'>
![png](https://i-blog.csdnimg.cn/blog_migrate/192b89310f60536d98383bad2390007d.png)
每月不同用户的占比
return_rate = pivot_status_count.apply(lambda x:x/x.sum())
return_rate.T.plot()
<AxesSubplot: xlabel='month'>
![png](https://i-blog.csdnimg.cn/blog_migrate/50efc2720e6f029849765fedf1bdb452.png)