电商交易数据分析
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('./order_info_2016.csv', index_col='id')
df.head()
df.describe()
device_type = pd.read_csv('./device_type.txt')
device_type
df.info()
df.orderId.unique().size
df.orderId.size
df.userId.unique().size
df.productId[(df.productId == 0)].size
df.cityId.unique().size
df.price = df.price / 100
df[df.payMoney < 0]
df.drop(index=df[df.payMoney < 0].index, inplace=True)
df[df.payMoney < 0].index
df.payMoney = df.payMoney / 100
df[df.channelId.isnull()]
df.drop(index=df[df.channelId.isnull()].index, inplace=True)
df[df.channelId.isnull()]
df.createTime = pd.to_datetime(df.createTime)
df.payTime = pd.to_datetime(df.payTime)
df.dtypes
import datetime
startTime = datetime.datetime(2016, 1, 1)
endTime = datetime.datetime(2016, 12, 31, 23, 59, 59)
df[df.createTime < startTime]
df.drop(index=df[df.createTime < startTime].index, inplace=True)
df[df.createTime < startTime]
df.drop(index=df[df.createTime > df.payTime].index, inplace=True)
df[df.createTime > endTime]
df[df.payTime < startTime]
df.orderId.unique().size
df.orderId.size
df.drop(index=df[df.orderId.duplicated()].index, inplace=True)
df.orderId.unique().size
df.drop(index=df[df.productId==0].index, inplace=True)
print(df.orderId.count())
print(df.userId.unique().size)
print(df.payMoney.sum()/100)
print(df.productId.unique().size)
productId_orderCount = df.groupby('productId').count()
['orderId'].sort_values(ascending=False)
print(productId_orderCount.head(10))
print(productId_orderCount.tail(10))
productId_turnover = df.groupby('productId').sum()
['payMoney'].sort_values(ascending=False)
print(productId_turnover.head(10))
print(productId_turnover.tail(10))
problem_productIds =
productId_turnover.tail(100).index.intersection(productId_orderCount.tail(100).index)
cityId_orderCount = df.groupby('cityId').count()
['orderId'].sort_values(ascending=False)
cityId_payMoney = df.groupby('cityId').sum()
['payMoney'].sort_values(ascending=False)
bins = np.arange(0, 25000, 100)
pd.cut(df.price, bins).value_counts()
plt.figure(figsize=(16, 16))
plt.hist(df['price'], bins)
price_cut_count = pd.cut(df.price, bins).value_counts()
zero_cut_result = (price_cut_count == 0)
zero_cut_result[zero_cut_result.values].index
bins = np.arange(0, 25000, 1000)
price_cut = pd.cut(df.price, bins).value_counts()
m = plt.pie(x=price_cut.values, labels=price_cut.index, autopct='%d%%',
shadow=True)
df['orderHour'] = df.createTime.dt.hour
df.groupby('orderHour').count()['orderId'].plot()
df['orderWeek'] = df.createTime.dt.dayofweek
df.groupby('orderWeek').count()['orderId']
def get_seconds(x):
return x.total_seconds()
df['payDelta'] = (df['payTime'] - df['createTime']).apply(get_seconds)
bins = [0, 50, 100, 1000, 10000, 100000]
pd.cut(df.payDelta, bins).value_counts()
pd.cut(df.payDelta, bins).value_counts().plot(kind='pie', autopct='%d%%',
shadow=True, figsize=(10, 10))
df.set_index('createTime', inplace=True)
turnover = df.resample('M').sum()['payMoney']
order_count = df.resample('M').count()['orderId']
turnover.plot()