#加载数据分析需要的库
#加载数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('./order.csv')
df.head()
#查看数据大概分布
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104557 entries, 0 to 104556
Data columns (total 11 columns):
id 104557 non-null int64
orderId 104557 non-null int64
userId 104557 non-null int64
productId 104557 non-null int64
cityId 104557 non-null int64
price 104557 non-null int64
payMoney 104557 non-null int64
channelId 104549 non-null object
deviceType 104557 non-null int64
createTime 104557 non-null object
payTime 104557 non-null object
dtypes: int64(8), object(3)
memory usage: 8.8+ MB
df.describe()
id orderId userId productId cityId price payMoney deviceType
count 104557.000000 1.045570e+05 1.045570e+05 104557.000000 104557.000000 1.045570e+05 1.045570e+05 104557.000000
mean 52279.000000 2.993341e+08 3.270527e+06 504.566275 154410.947225 9.167350e+04 8.686689e+04 2.385292
std 30183.150385 5.149818e+07 4.138208e+07 288.130647 72197.163762 9.158836e+04 9.072028e+04 0.648472
min 1.000000 1.035627e+08 2.930600e+04 0.000000 30000.000000 6.000000e+02 -1.000000e+03 1.000000
25% 26140.000000 2.633627e+08 2.179538e+06 254.000000 100011.000000 3.790000e+04 3.360000e+04 2.000000
50% 52279.000000 2.989828e+08 2.705995e+06 507.000000 150001.000000 5.920000e+04 5.500000e+04 2.000000
75% 78418.000000 3.349972e+08 3.271237e+06 758.000000 220002.000000 1.080000e+05 1.040000e+05 3.000000
max 104557.000000 4.871430e+08 3.072939e+09 1000.000000 380001.000000 2.295600e+06 2.294200e+06 6.000000
#开始清理数据
df.orderId.unique().size
df.orderId.size
#数据有重复值最后处理,删除可能会影响其他列的数据
#productId最小值为0,查看一下数据
df[df['productId'] == 0]
#177条数据对整体数据影响大不,最后进行删除处理
#查看price是否有空值并且将price单位为分转为元
df[df['price'].isnull()]
df['price'] = df['price'] / 100
#payMoney最小值为负数需要处理
df[df['payMoney']<0]
#删除小于0的值
df.drop(df[df['payMoney']<0].index,inplace=True)
df['payMoney'] = df['payMoney'] / 100
#查看channelId空值,删除空值
df[df['channelId'].isnull()]
df.drop(df[df['channelId'].isnull()].index,inplace=True)
#将createTime,payTime转换格式
df['createTime'] = pd.to_datetime(df['createTime'])
df['payTime'] = pd.to_datetime(df['payTime'])
#本次数据分析统计2016的数据
import datetime
startTime = datetime.datetime(2016,1,1)
endTime = datetime.datetime(2016,12,31,23,59,59)
#查看2016之前and2016年之后的数据并删除
df[df['createTime']<startTime]
df.drop(df[df['createTime']<startTime].index,inplace=True)
df[df['createTime']>endTime]
df.drop(df[df['createTime']>endTime].index,inplace=True)
df[df['payTime']<startTime]
#删除orderId重复值
df.drop(df[df['orderId'].duplicated()].index,inplace=True)
#删除productId == 0的值
df.drop(df[df['productId '] == 0].index,inplace=True)
#数据清洗完毕,开始分析
#分析数据可以从数据的维度和指标来考虑,一个维度可以分析多个指标
#分析商品销量的前10and后十个
productId_orderId = df.groupby('productId').count()['orderId'].sort_values(ascending=False)
productId_orderId.head(10)
productId
895 354
762 350
103 334
587 303
385 302
60 301
38 301
403 297
345 292
823 288
Name: orderId, dtype: int64
productId_orderId.tail(10)
productId
948 29
856 28
621 27
272 26
563 24
347 21
597 19
468 18
986 16
1000 13
Name: orderId, dtype: int64
#查询2016年商品销售额
productId_payMoney = df.groupby('productId').sum()['payMoney'].sort_values(ascending=False)
#商品销售额前10and后十个
productId_payMoney.head(10)
productId
385 42752210
61 36157200
103 34464120
405 33952500
720 32240510
345 32016220
698 31845860
182 29660000
383 28079000
396 26955600
Name: payMoney, dtype: int64
productId_payMoney.tail(10)
productId
310 2287900
847 2286900
454 2253500
817 2250900
597 2184700
408 1811100
986 1478400
964 1423800
1000 1216900
347 1207000
Name: payMoney, dtype: int64
#查看销量后100and销售额后100的交集,如果销量和销售额都不行,可以考虑下架
productId_tail = productId_orderId.tail(100).index.intersection(productId_payMoney.tail(100).index)
productId_tail
Int64Index([ 847, 352, 806, 7, 145, 807, 460, 314, 318, 551, 586,
599, 137, 577, 569, 91, 676, 874, 582, 303, 817, 27,
868, 855, 590, 359, 242, 408, 218, 392, 964, 14, 104,
985, 436, 629, 247, 220, 578, 579, 859, 958, 454, 227,
310, 469, 16, 277, 478, 528, 948, 856, 621, 272, 347,
597, 468, 986, 1000],
dtype='int64', name='productId')
#分析各城市商品销量和销售额
cityId_orderId = df.groupby('cityId').count()['orderId'].sort_values(ascending=False)
cityId_orderId
cityId_payMoney = df.groupby('cityId').count()['payMoney'].sort_values(ascending=False)
cityId_payMoney
#分析什么价格的商品销量最高,把价格区间设置为100元并画图
bin = np.arange(100,25000,100)
bins = pd.cut(df['price'],bin)
count_price = df['price'].groupby(bins).count()
count_price.plot()
#按1000分桶分析
bin = np.arange(0,25000,1000)
bins = pd.cut(df['price'],bin)
count_price = df['price'].groupby(bins).count()
count_price.plot()
#根据下单时间分析
#创建小时维度的列
df['hour'] = df['createTime'].dt.hour
df.groupby('hour').count()['orderId'].plot()
#中午12-14点下单率最高,其次是晚上8点
#根据周几分析
#创建周几维度的列
df['dayofweek'] = df.createTime.dt.dayofweek
df.groupby('dayofweek').count()['orderId'].plot()
#一周之中周四周五周六下单最多
#商品推广可以根据下单率高的时间来进行
#分析用户创建订单后多久支付
df['paydate']=(df['payTime'] - df['createTime'])
df['paydate']
#大部分用户支付时间为十多分钟,说明用户购买目的明确,购买意愿强烈
电商2016年交易数据分析
最新推荐文章于 2024-07-15 18:38:33 发布