目录
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
df=pd.read_excel(r'D:\机器学习\数据分析项目\e-commerce-data-analysis-master\e-commerce-data-analysis-master\电商订单数据分析\某电商网站订单数据.xlsx',index_col='id')
数据探索和数据清洗
数据探索
display(df,df.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 104557 entries, 1 to 104557
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 orderID 104557 non-null object
1 userID 104557 non-null object
2 goodsID 104557 non-null object
3 orderAmount 104557 non-null float64
4 payment 104557 non-null float64
5 chanelID 104549 non-null object
6 platfromType 104557 non-null object
7 orderTime 104557 non-null datetime64[ns]
8 payTime 104557 non-null datetime64[ns]
9 chargeback 104557 non-null object
dtypes: datetime64[ns](2), float64(2), object(6)
memory usage: 8.8+ MB
orderID | userID | goodsID | orderAmount | payment | chanelID | platfromType | orderTime | payTime | chargeback | |
---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||
1 | sys-2018-254118088 | user-157213 | PR000064 | 272.51 | 272.51 | 渠道-0396 | APP | 2018-02-14 12:20:36 | 2019-02-28 13:38:41 | 否 |
2 | sys-2018-263312190 | user-191121 | PR000583 | 337.93 | 337.93 | 渠道-0765 | Wech atMP | 2018-08-14 09:40:34 | 2019-01-01 14:47:14 | 是 |
3 | sys-2018-188208169 | user-211918 | PR000082 | 905.68 | 891.23 | 渠道-0530 | We c hatMP | 2018-11-02 20:17:25 | 2019-01-19 20:06:35 | 否 |
4 | sys-2018-203314910 | user-201322 | PR000302 | 786.27 | 688.88 | 渠道-0530 | WEB | 2018-11-19 10:36:39 | 2019-08-07 12:24:35 | 否 |
5 | sys-2018-283989279 | user-120872 | PR000290 | 550.77 | 542.51 | 渠道-9527 | APP | 2018-12-26 11:19:16 | 2019-10-01 07:42:43 | 否 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
104553 | sys-2020-268392025 | user-182189 | PR000082 | 186.37 | 126.21 | 渠道-0568 | Wech atMP | 2020-01-01 23:30:57 | 2020-01-01 23:31:09 | 否 |
104554 | sys-2020-213140521 | user-170057 | PR000603 | 138.81 | 138.81 | 渠道-0007 | APP | 2020-01-01 23:31:26 | 2020-01-01 23:31:36 | 否 |
104555 | sys-2020-274536228 | user-156592 | PR000405 | 2632.27 | 2632.27 | 渠道-0985 | Wech atMP | 2020-01-01 23:46:56 | 2020-01-01 23:47:06 | 否 |
104556 | sys-2020-279922239 | user-173702 | PR000387 | 1206.76 | 1029.46 | 渠道-0985 | APP | 2020-01-01 23:47:01 | 2020-01-01 23:47:48 | 否 |
104557 | sys-2020-250738010 | user-164299 | PR000525 | 3053.70 | 2982.26 | 渠道-0007 | APP | 2020-01-01 23:48:26 | 2020-01-01 23:49:44 | 否 |
104557 rows × 10 columns
数据清洗
# 提取2019年的订单数据
# 处理业务流程不符的数据(支付时间早于下单时间、支付时长超过30分钟、订单金额小于0、支付金额小于0)
# 处理渠道为空的数据(补充众数)
# 处理平台类型字段(去掉多余空格,保持数据一致)
# 添加折扣字段,处理折扣大于1的字段(将支付金额改为‘订单金额’ * ‘平均折扣’)
#去重和取2019年数据
df=df.drop_duplicates('orderID')
df.loc[:,'orderyear']=df.loc[:,'orderTime'].apply(lambda x :str(x).split('-')[0])
df=df.loc[df['orderyear']=='2019',:]
df.shape
(104269, 11)
#处理业务流程不符的数据
df=df[df['orderTime']<=df['payTime']]
df=df[pd.to_timedelta(df['payTime']-df['orderTime'])<='0 days 00:30:00']
df=df[df['orderAmount']>=0]
#df=df[(df.payTime - df.orderTime).dt.seconds<=1800]
df=df[df['payment']>=0]
df.shape
(103321, 11)
#处理渠道为空的数据(补众数)
df['chanelID']=df['chanelID'].fillna(df['chanelID'].mode())
#处理平台类型字段(去掉多余空格,保持数据一致)备注:文本型数据很容易不一致
df['platfromType']=df['platfromType'].apply(lambda x :x.replace(' ',""))
#添加折扣字段,处理折扣大于1的字段(如果折扣大于1,则取平均折扣)(将支付金额改为‘订单金额’*‘平均折扣’)
df['discount']=df['payment']/df['orderAmount']
df['discount']=df['discount'].apply(lambda x :x if x<=1 else df['discount'].mean())
df['payment']=df['orderAmount']*df['discount']
维度分析
# 交易总金额(gmv),总销售额,实%save销售额,退货率,客单价
# 每月gmv及趋势分析
# 流量渠道来源分析
# 周一到周日那天的下单量最高,每天哪个时间段下单量最高
# 用户复购率
交易总金额(gmv),总销售额,实际销售额,退货率,客单价
gmv=df['orderAmount'].sum()
total_payment=df['payment'].sum()
real_payment=df.loc[df['chargeback']=='否','payment'].sum()
chargeback_rate=df.loc[df['chargeback']=='是','chargeback'].count()/df.shape[0]
user_count=df['userID'].nunique()
arppu=real_payment/user_count
l1=[gmv,total_payment,real_payment,chargeback_rate,user_count,arppu]
l2=['gmv','total_payment','real_payment','chargeback_rate','user_count','arppu']
z=dict(zip(l2,l1))
z
{'gmv': 108495929.7,
'total_payment': 102942061.44413847,
'real_payment': 89215591.25223792,
'chargeback_rate': 0.13173507805770365,
'user_count': 78634,
'arppu': 1134.5676329862135}
每月的gmv趋势分析
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
df['month']=df['orderTime'].dt.month
gmv_m=df.groupby('month').agg({'orderAmount':'sum'})/10000
pay_m=df.groupby('month').agg({'payment':'sum'})/10000
real_m=df[df['chargeback']=='否'].groupby('month').agg({'payment':'sum'})/10000
plt.figure(figsize=(6,4),dpi=120)
plt.plot(gmv_m,label='每月交易额')
plt.plot(pay_m,label='每月销售额')
plt.plot(real_m,label='每月实际销售额')
plt.title('每月交易额和实际销售额')
plt.xlabel("月份")
plt.ylabel('金额(万元)')
plt.grid(axis='y',linestyle='--',alpha=0.5)
for i in range(0,12):
plt.text(i+0.5,real_m.iloc[i,0]-20,np.round(real_m.iloc[i,0],2))
plt.legend()
plt.show()
小结:二月的gmv最低,是gmv最大值对应月份(11月)1/2。
渠道来源分析
gmv_c=df.groupby('chanelID').agg({'orderAmount':'sum'})
gmv_c_10=gmv_c.nlargest(10,columns='orderAmount')
plt.figure(figsize=(6,6),dpi=120)
plt.pie(gmv_c_10['orderAmount'],labels=gmv_c_10.index
,autopct='%.2f%%',pctdistance=0.8,
wedgeprops={'width':0.35
, 'linewidth':1
,'edgecolor':'white'})
plt.title('渠道VS销售额')
plt.show()
#结论:渠道-0896,渠道-9527,渠道-0530位列销售额的前3
小结:
- 渠道-0896的销售额最高,占所有渠道的18.78%,前5个渠道的gmv占比57.27%
工作日对销量影响
df['week']=df['orderTime'].dt.weekday
c_w=df.groupby('week').agg({'orderID':'count'})
plt.plot(c_w['orderID'])
plt.title('星期VS下单量')
plt.xticks([0,1,2,3,4,5,6],['七','一','二','三','四','五','六'])
plt.show()
小结:
- 周末以及周末前后的gmv显著高于工作日
时段对销量影响
df['time']=df['orderTime'].dt.floor('30T').dt.time
t_c=df.groupby('time').agg({'orderID':'count'})
plt.figure(figsize=(12,4))
plt.plot(range(t_c.shape[0]),t_c['orderID'])
plt.xticks(range(t_c.shape[0]),t_c.index,rotation=90)
plt.title('时段VS销量')
plt.show()
小结:
- 每天在13:00和20:00会迎来来个小高峰,随之下降的一个趋势,在17:00和凌晨到达最低点
用户复购率统计分析
temp=pd.pivot_table(data=df,index=['userID'],columns='month',values=['orderID'],aggfunc='count')
#对复购的记为1,购买一次为0,每月购买空
def handle_data(x):
return 1 if x>1 else 0 if x==1 else np.nan
temp=temp.applymap(handle_data)#applymap是针对每个元素
repurchase_rate=temp.sum()/temp.count()*100
repurchase_rate=pd.DataFrame(repurchase_rate).reset_index()
repurchase_rate=repurchase_rate[['month',0]]
repurchase_rate.columns=['month','count']
plt.figure(figsize=(12,4))
plt.plot(repurchase_rate['count'])
plt.title('月份VS复购率')
plt.show()
小结:
- 用户复购率分为4月前和4月后,4月前的复购率在2左右,4月后的用户复购率在3左右
rfm模型分析
#只分析购买且没有退货的永华
df_c=df[df['chargeback']=='否']
df_c.loc[:,'F']=1
df_c.head()
orderID | userID | goodsID | orderAmount | payment | chanelID | platfromType | orderTime | payTime | chargeback | orderyear | discount | month | week | time | F | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||||||||
6 | sys-2019-279103297 | user-146548 | PR000564 | 425.20 | 425.20 | 渠道-0765 | WechatMP | 2019-01-01 00:12:23 | 2019-01-01 00:13:37 | 否 | 2019 | 1.000000 | 1 | 1 | 00:00:00 | 1 |
7 | sys-2019-316686066 | user-104210 | PR000709 | 1764.37 | 1707.04 | 渠道-0396 | WechatMP | 2019-01-01 00:23:06 | 2019-01-01 00:23:32 | 否 | 2019 | 0.967507 | 1 | 1 | 00:00:00 | 1 |
8 | sys-2019-306447069 | user-104863 | PR000499 | 499.41 | 480.42 | 渠道-0007 | WechatMP | 2019-01-01 01:05:50 | 2019-01-01 01:06:17 | 否 | 2019 | 0.961975 | 1 | 1 | 01:00:00 | 1 |
9 | sys-2019-290267674 | user-206155 | PR000253 | 1103.00 | 1050.95 | 渠道-0330 | APP | 2019-01-01 01:16:12 | 2019-01-01 01:16:25 | 否 | 2019 | 0.952811 | 1 | 1 | 01:00:00 | 1 |
10 | sys-2019-337079027 | user-137939 | PR000768 | 465.41 | 465.41 | 渠道-9527 | ALiMP | 2019-01-01 01:31:00 | 2019-01-01 01:31:36 | 否 | 2019 | 1.000000 | 1 | 1 | 01:30:00 | 1 |
res_df=df_c.pivot_table(index='userID',values=['orderTime','F','orderAmount'],aggfunc={
'orderTime':max,
'F':sum,
'orderAmount':sum
})
res_df
F | orderAmount | orderTime | |
---|---|---|---|
userID | |||
user-100000 | 1 | 1978.47 | 2019-10-13 18:46:46 |
user-100003 | 1 | 521.60 | 2019-05-24 13:04:05 |
user-100006 | 1 | 466.89 | 2019-11-14 15:37:19 |
user-100007 | 1 | 2178.20 | 2019-01-14 18:45:35 |
user-100008 | 1 | 4949.65 | 2019-11-16 17:15:03 |
... | ... | ... | ... |
user-299980 | 1 | 441.71 | 2019-10-18 10:53:37 |
user-299983 | 1 | 706.80 | 2019-12-27 17:57:11 |
user-299989 | 2 | 1685.18 | 2019-11-11 10:40:08 |
user-299992 | 1 | 508.75 | 2019-01-01 16:14:47 |
user-299995 | 1 | 479.94 | 2019-03-30 16:35:12 |
70592 rows × 3 columns
a=pd.to_datetime('2019-12-31 23:59:59')
res_df['R']=res_df['orderTime'].apply(lambda x :a-x).dt.days
res_df['M']=res_df['orderAmount']
res_df=res_df[['R','F','M']]
res_df
R | F | M | |
---|---|---|---|
userID | |||
user-100000 | 79 | 1 | 1978.47 |
user-100003 | 221 | 1 | 521.60 |
user-100006 | 47 | 1 | 466.89 |
user-100007 | 351 | 1 | 2178.20 |
user-100008 | 45 | 1 | 4949.65 |
... | ... | ... | ... |
user-299980 | 74 | 1 | 441.71 |
user-299983 | 4 | 1 | 706.80 |
user-299989 | 50 | 2 | 1685.18 |
user-299992 | 364 | 1 | 508.75 |
user-299995 | 276 | 1 | 479.94 |
70592 rows × 3 columns
res_df['F'].value_counts()
temp_df=res_df.apply(lambda x :x-x.mean())
rfm_model=temp_df.applymap(lambda x :'1' if x>=0 else '0')
rfm_model
R | F | M | |
---|---|---|---|
userID | |||
user-100000 | 0 | 0 | 1 |
user-100003 | 1 | 0 | 0 |
user-100006 | 0 | 0 | 0 |
user-100007 | 1 | 0 | 1 |
user-100008 | 0 | 0 | 1 |
... | ... | ... | ... |
user-299980 | 0 | 0 | 0 |
user-299983 | 0 | 0 | 0 |
user-299989 | 0 | 1 | 1 |
user-299992 | 1 | 0 | 0 |
user-299995 | 1 | 0 | 0 |
70592 rows × 3 columns
#定义映射函数
tage_dict={
'111':'重要价值用户'
,'101':'重要发展用户'
,'011':'重要保持用户'
,'001':'重要挽留用户'
,'110':'一般价值用户'
,'100':'一般发展用户'
,'010':'一般保持用户'
,'000':'一般挽留用户'
}
def make_tag(model):
key=model['R']+model['F']+model['M']
return tage_dict[key]
rfm_model['label']=rfm_model.apply(make_tag,axis=1)
rfm_model
R | F | M | label | |
---|---|---|---|---|
userID | ||||
user-100000 | 0 | 0 | 1 | 重要挽留用户 |
user-100003 | 1 | 0 | 0 | 一般发展用户 |
user-100006 | 0 | 0 | 0 | 一般挽留用户 |
user-100007 | 1 | 0 | 1 | 重要发展用户 |
user-100008 | 0 | 0 | 1 | 重要挽留用户 |
... | ... | ... | ... | ... |
user-299980 | 0 | 0 | 0 | 一般挽留用户 |
user-299983 | 0 | 0 | 0 | 一般挽留用户 |
user-299989 | 0 | 1 | 1 | 重要保持用户 |
user-299992 | 1 | 0 | 0 | 一般发展用户 |
user-299995 | 1 | 0 | 0 | 一般发展用户 |
70592 rows × 4 columns
plt.figure(figsize=(12,4))
rfm=rfm_model.groupby('label').agg({'label':'count'})
rfm.columns=['count']
rfm['pct']=rfm['count']/rfm['count'].sum()
plt.bar(rfm.index,rfm['pct'],color=np.random.rand(7, 3))
plt.title('RFM模型确定用户类型',fontsize=20)
for i in range(rfm.shape[0]):
plt.text(rfm.index[i],rfm.iloc[i,1],np.round(rfm.iloc[i,1],2),fontsize=15)
plt.xticks(fontsize=12)
plt.show()
小结:
- 从rfm模型中可以看出,商店的重要价值用户占比为4%,一般发展用户占比最高为32%