电商网站数据处理(加强版)
一、订单数据分析
提取2019年的订单数据
处理业务流程不符的数据(支付时间早于下单时间、支付时长超过30分钟、订单金额小于0、支付金额小于0)
处理渠道为空的数据(补充众数)
处理平台类型字段(去掉多余的空格,保持数据一致)
添加折扣字段,处理折扣大于1的字段(将支付金额修改为“订单金额*平均折扣”)
交易总金额(GMV)、总销售额、实际销售额、退货率、客单价
每月GMV及趋势分析(折线图)
流量渠道来源拆解GMV占比(饼图)
用户复购率分析
import numpy as np
import pandas as pd
# 0.从Excel文件中读取订单数据
order_df = pd.read_excel(r'C:\Users\wby\Desktop\某电商网站订单数据.xlsx', index_col='id')
order_df
order_df.info()
# 1.提取2019年的订单数据
order_df = order_df[order_df.orderID.str.contains('2019')]
# 2.处理业务流程不符的数据(支付时间早于下单时间、支付时长超过30分钟、订单金额小于0、支付金额小于0)。
order_df = order_df[(order_df['orderTime']<=order_df['payTime']) & ((order_df['payTime'] - order_df['orderTime']).dt.total_seconds() <= 1800) & (order_df['orderAmount']>=0) & (order_df['payment']>=0)]
order_df
order_df = order_df.rename(columns={'chanelID':'channelID', 'platfromType':'platformType'})
order_df
# 3. 处理渠道为空的数据(补充众数)
order_df['channelID'] = order_df['channelID'].fillna(order_df['channelID'].mode())
# 4. 处理平台类型字段(去掉多余的空格,保持数据一致)
# df2['platformType'] = df2['platformType'].apply(lambda x: x.replace(' ',''))
order_df.platformType.str.replace(r'\s', '', regex=True).str.upper()
# 5. 添加折扣字段,处理折扣大于1的字段(将支付金额修改为“订单金额*平均折扣”)
# order_df['discount'] = order_df['payment'] / order_df['orderAmount']
# discount_mean = order_df[order_df.discount <=1].discount.mean()
# temp_df = order_df[order_df['discount']>1]
# temp_df['discount'] = discount_mean
# temp_df['payment'] = temp_df['orderAmount'] * discount_mean
# order_df[order_df['discount']>1] = temp_df
# order_df
order_df['discount'] = np.round(order_df.payment / order_df.orderAmount, 2)
mean_discount = round(order_df[order_df.discount <= 1].discount.mean(), 2)
order_df['payment'] = order_df.payment.where(
order_df.discount <= 1, np.round(order_df.orderAmount * mean_discount, 2)
)
order_df[order_df.discount > 1]
# 6. 交易总金额(GMV)、总销售额、实际销售额、退货率、客单价(ARPPU)
gmv = order_df.orderAmount.sum()
total_amount = order_df.payment.sum()
total_payment = order_df[order_df.chargeback == '否'].payment.sum()
back_rate = order_df[order_df.chargeback == '是'].orderID.count() / order_df.orderID.count()
arppu = total_payment / order_df.userID.nunique()
print(f'GMV: {gmv / 10000:.2f}万元')
print(f'总销售额: {total_amount / 10000:.2f}万元')
print(f&#