读取数据
导入所需科学计算库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 忽略小警告
import warnings
warnings.filterwarnings('ignore')
读取数据
df = pd.read_csv('D:\数据分析相关笔记等\数据集\电子产品销售分析数据集\电子产品销售分析.csv')
df.head()
Unnamed: 0 | event_time | order_id | product_id | category_id | category_code | brand | price | user_id | age | sex | local | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2020-04-24 11:50:39 UTC | 2294359932054536986 | 1515966223509089906 | 2.268105e+18 | electronics.tablet | samsung | 162.01 | 1.515916e+18 | 24.0 | 女 | 海南 |
1 | 1 | 2020-04-24 11:50:39 UTC | 2294359932054536986 | 1515966223509089906 | 2.268105e+18 | electronics.tablet | samsung | 162.01 | 1.515916e+18 | 24.0 | 女 | 海南 |
2 | 2 | 2020-04-24 14:37:43 UTC | 2294444024058086220 | 2273948319057183658 | 2.268105e+18 | electronics.audio.headphone | huawei | 77.52 | 1.515916e+18 | 38.0 | 女 | 北京 |
3 | 3 | 2020-04-24 14:37:43 UTC | 2294444024058086220 | 2273948319057183658 | 2.268105e+18 | electronics.audio.headphone | huawei | 77.52 | 1.515916e+18 | 38.0 | 女 | 北京 |
4 | 4 | 2020-04-24 19:16:21 UTC | 2294584263154074236 | 2273948316817424439 | 2.268105e+18 | NaN | karcher | 217.57 | 1.515916e+18 | 32.0 | 女 | 广东 |
df['event_time'].unique()
array(['2020-04-24 11:50:39 UTC', '2020-04-24 14:37:43 UTC', '2020-04-24 19:16:21 UTC', ..., '2020-11-21 10:10:01 UTC', '2020-11-21 10:10:13 UTC', '2020-11-21 10:10:30 UTC'], dtype=object)
df['age'].unique()
array([24., 38., 32., 20., 21., 16., 49., 43., 37., 27., 48., 47., 28., 41., 22., 18., 44., 50., 46., 25., 34., 45., 17., 30., 36., 35., 39., 19., 40., 33., 31., 42., 23., 29., 26.])
清洗数据
提取数据时,处理与业务流程不符合数据,售价为负
df[df['price'] < 0]
处理空值
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 564169 entries, 0 to 564168 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 564169 non-null int64 1 event_time 564169 non-null object 2 order_id 564169 non-null int64 3 product_id 564169 non-null int64 4 category_id 564169 non-null float64 5 category_code 434799 non-null object 6 brand 536945 non-null object 7 price 564169 non-null float64 8 user_id 564169 non-null float64 9 age 564169 non-null float64 10 sex 564169 non-null object 11 local 564169 non-null object dtypes: float64(4), int64(3), object(5) memory usage: 51.7+ MB
df.isnull().sum()
Unnamed: 0 0 event_time 0 order_id 0 product_id 0 category_id 0 category_code 129370 brand 27224 price 0 user_id 0 age 0 sex 0 local 0 dtype: int64
# 删除含有空值的行
df = df.dropna(axis = 0, how = 'any')
处理数据类型
# 将'category_id', 'user_id', 'age'改为int64
df[['category_id', 'user_id', 'age']] = df[['category_id', 'user_id', 'age']].astype('int64')
df['event_time'] = pd.to_datetime(df['event_time'])
# 月
df['month'] = df['event_time'].dt.month
# 季
df['quater'] = df['event_time'].dt.to_period('Q') # 参数 M 表示月份,Q 表示季度,A 表示年度,D 表示按天
df.drop(df[df['quater'] == '1970Q1'].index, inplace=True) # 去除异常值
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 419890 entries, 0 to 564168 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 419890 non-null int64 1 event_time 419890 non-null datetime64[ns, UTC] 2 order_id 419890 non-null int64 3 product_id 419890 non-null int64 4 category_id 419890 non-null int64 5 category_code 419890 non-null object 6 brand 419890 non-null object 7 price 419890 non-null float64 8 user_id 419890 non-null int64 9 age 419890 non-null int64 10 sex 419890 non-null object 11 local 419890 non-null object 12 month 419890 non-null int64 13 quater 419890 non-null period[Q-DEC] dtypes: datetime64[ns, UTC](1), float64(1), int64(7), object(4), period[Q-DEC](1) memory usage: 48.1+ MB
处理年龄组
bins = [16, 20, 30, 40, 60]
labels = ['16-19', '20-29', '30-39', '40-60']
df['agegroup'] = pd.cut(df['age'], bins, right = False, labels = labels)
查看数据
df.describe()
Unnamed: 0 | order_id | product_id | category_id | price | user_id | age | month | |
---|---|---|---|---|---|---|---|---|
count | 4.198900e+05 | 4.198900e+05 | 4.198900e+05 | 4.198900e+05 | 419890.000000 | 4.198900e+05 | 419890.000000 | 419890.000000 |
mean | 1.799809e+06 | 2.370509e+18 | 1.676753e+18 | 2.274301e+18 | 254.281619 | 1.515916e+18 | 33.144069 | 7.754607 |
std | 7.529273e+05 | 2.014144e+16 | 3.171234e+17 | 2.438523e+16 | 321.167613 | 2.372070e+07 | 10.119800 | 2.448404 |
min | 0.000000e+00 | 2.294360e+18 | 1.515966e+18 | 2.268105e+18 | 0.000000 | 1.515916e+18 | 16.000000 | 1.000000 |
25% | 1.669817e+06 | 2.354505e+18 | 1.515966e+18 | 2.268105e+18 | 39.330000 | 1.515916e+18 | 24.000000 | 7.000000 |
50% | 1.854132e+06 | 2.375487e+18 | 1.515966e+18 | 2.268105e+18 | 138.870000 | 1.515916e+18 | 33.000000 | 8.000000 |
75% | 2.463694e+06 | 2.388441e+18 | 1.515966e+18 | 2.268105e+18 | 347.200000 | 1.515916e+18 | 42.000000 | 10.000000 |
max | 2.633520e+06 | 2.388441e+18 | 2.388434e+18 | 2.374499e+18 | 11574.050000 | 1.515916e+18 | 50.000000 | 11.000000 |
price的均值在254.28元,中位数在138.87元,可能存在极值影响 年龄的平均值、中位数均在33岁,说明该数据服从正态分布或近似正态分布
数据分析
查看全年销售额的增长率
# 销售额数据分组汇总
sales_month = df.groupby(['month'])['price'].sum()
# 计算2-11月销售增长率
sales_rate_2 = sales_month[2] / sales_month[1] - 1
sales_rate_3 = sales_month[3] / sales_month[2] - 1
sales_rate_4 = sales_month[4] / sales_month[3] - 1
sales_rate_5 = sales_month[5] / sales_month[4] - 1
sales_rate_6 = sales_month[6] / sales_month[5] - 1
sales_rate_7 = sales_month[7] / sales_month[6] - 1
sales_rate_8 = sales_month[8] / sales_month[7] - 1
sales_rate_9 = sales_month[9] / sales_month[8] - 1
sales_rate_10 = sales_month[10] / sales_month[9] - 1
sales_rate_11 = sales_month[11] / sales_month[10] - 1
# 设置2-11月的增长率标签
sales_rate_2_label = '%.2f%%' % (sales_rate_2 * 100)
sales_rate_3_label = '%.2f%%' % (sales_rate_3 * 100)
sales_rate_4_label = '%.2f%%' % (sales_rate_4 * 100)
sales_rate_5_label = '%.2f%%' % (sales_rate_5 * 100)
sales_rate_6_label = '%.2f%%' % (sales_rate_6 * 100)
sales_rate_7_label = '%.2f%%' % (sales_rate_7 * 100)
sales_rate_8_label = '%.2f%%' % (sales_rate_8 * 100)
sales_rate_9_label = '%.2f%%' % (sales_rate_9 * 100)
sales_rate_10_label = '%.2f%%' % (sales_rate_10 * 100)
sales_rate_11_label = '%.2f%%' % (sales_rate_11 * 100)
# 将1-11月的销售额以及增长率转换为DataFrame
sales = pd.DataFrame({
'sales_month':sales_month,
'sales_rate':[0, sales_rate_2, sales_rate_3, sales_rate_4, sales_rate_5, sales_rate_6,
sales_rate_7, sales_rate_8, sales_rate_9, sales_rate_10, sales_rate_11],
'sales_rate_label':['0.00%', sales_rate_2_label, sales_rate_3_label, sales_rate_4_label,
sales_rate_5_label, sales_rate_6_label, sales_rate_7_label,
sales_rate_8_label, sales_rate_9_label, sales_rate_10_label, sales_rate_11_label]
})
# 绘制图形
from matplotlib import ticker
# 设置中文字符
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
# 设置背景样式
plt.style.use('ggplot')
# x轴数据
x = ['{}月'.format(values) for values in sales.index.tolist()]
# y轴数据
y1 = sales['sales_month']
y2 = sales['sales_rate']
# 创建画布
fig = plt.figure(figsize = (20, 8), dpi = 80)
ax1 = fig.add_subplot(111)
ax2 = ax1.twinx()
# 绘制图形
ax1.bar(x, y1, color = 'pink', label = '销售额')
ax2.plot(x, y2, color = 'black',marker = '*', label = '增长率')
ax1.set_xticks(x)
ax1.set_ylim(0, 30000000)
ax2.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1, decimals=1))
ax1.set_xlabel('月份')
ax1.set_ylabel('销售额')
ax2.set_ylabel('增长率')
ax1.set_title("销售额与增长率")
plt.legend()
plt.show()
各省销售情况
各省销售占比
# 按省分类
sales_area = df.groupby('local')['price'].sum()
# 绘图
sales_area.plot(figsize=(20,8),kind = 'pie',autopct='%1.1f%%',title='2020年总销售额占比')
# # 准备数据
# labels = sales_area.index
# size = sales_area.values
# # 设置画布
# plt.figure(figsize = (20, 8), dpi = 80)
# # 饼图属性
# plt.pie(size, labels = labels, autopct = '%.2f%%')
# # 长宽更改
# plt.axis('equal')
# plt.title('2020年总销售额占比', fontsize='x-large')
# plt.legend()
plt.show()
# 各地区每一年的销售额
month_sales_area = df.groupby(by=['local','month'])['price'].sum()
# 将分组后的多层索引转换为列数据
month_sales_area = month_sales_area.reset_index(level=[0, 1])
# 使用数据透视表重新整理数据
month_sales_area = pd.pivot_table(month_sales_area,
index='local',
columns='month',
values='price')
# 绘制图形
month_sales_area.plot(figsize=(20, 8), kind='bar', title='2020年不同省份销售额对比')
plt.show()
各省8月份的销售额均大于其他月份,说明在8月份消费者更容易消费
前四个月的销售额很低迷,商家应针对不同月份退出不同优惠活动
各省份不同类别产品销售比
# 各省份不同类型产品的销售额
category_sales_area = df.groupby(by = ['local','category_code'])['price'].sum().reset_index().sort_values('price',ascending = False).head(50)
category_sales_area = category_sales_area.groupby(by=['local','category_code'])['price'].sum()
# 将分组后的多层索引设置为列数据
category_sales_area = category_sales_area.reset_index()
# 数据透视表整理数据
category_sales_area = pd.pivot_table(category_sales_area,
index='local',
columns='category_code',
values='price')
# 绘制图形
category_sales_area.plot(figsize=(20, 8), kind ='bar', title='不同类别产品T50在各省份销售对比')
plt.show()