# 1. Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Load the UNIQLO sales data file into a DataFrame
UNIQLO=pd.read_csv('L2W1.csv')
# Data cleaning plan: use descriptive summaries to check for missing values,
# then drop the rows whose revenue is negative.
# Preview the first rows to see the available columns.
UNIQLO.head()
| store_id | city | channel | gender_group | age_group | wkd_ind |
product | customer | revenue | order | quant | unit_cost |
---|---|---|---|---|---|
0 | 658 | 深圳 | 线下 | Female | 25-29 |
796.0 | 4 | 4 | 59 | ||
1 | 146 | 杭州 | 线下 | Female | 25-29 |
149.0 | 1 | 1 | 49 | ||
2 | 70 | 深圳 | 线下 | Male | >=60 |
2 | 2 | 49 | |||
3 | 658 | 深圳 | 线下 | Female | 25-29 |
1 | 1 | 49 | |||
4 | 229 | 深圳 | 线下 | Male | 20-24 |
2 | 3 | 9 |
# Column dtypes and non-null counts; every column reports 22293 non-null
# entries out of 22293 rows, i.e. there are no missing values.
UNIQLO.info()  # no missing values
# Summary statistics of the numeric columns (used to spot the negative
# minimum in `revenue`).
UNIQLO.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22293 entries, 0 to 22292
Data columns (total 12 columns):
store_id 22293 non-null int64
city 22293 non-null object
channel 22293 non-null object
gender_group 22293 non-null object
age_group 22293 non-null object
wkd_ind 22293 non-null object
product 22293 non-null object
customer 22293 non-null int64
revenue 22293 non-null float64
order 22293 non-null int64
quant 22293 non-null int64
unit_cost 22293 non-null int64
dtypes: float64(1), int64(5), object(6)
memory usage: 2.0+ MB
| | store_id | customer | revenue | order | quant | unit_cost |
|---|---|---|---|---|---|---|
| count | 22293.000000 | 22293.000000 | 22293.000000 | 22293.000000 | 22293.000000 | 22293.000000 |
| mean | 335.391558 | 1.629480 | 159.531371 | 1.651998 | 1.858072 | 46.124658 |
| std | 230.236167 | 1.785605 | 276.254066 | 1.861480 | 2.347301 | 19.124347 |
| min | 19.000000 | 1.000000 | -0.660000 | 1.000000 | 1.000000 | 9.000000 |
| 25% | 142.000000 | 1.000000 | 64.000000 | 1.000000 | 1.000000 | 49.000000 |
| 50% | 315.000000 | 1.000000 | 99.000000 | 1.000000 | 1.000000 | 49.000000 |
| 75% | 480.000000 | 2.000000 | 175.000000 | 2.000000 | 2.000000 | 49.000000 |
| max | 831.000000 | 58.000000 | 12538.000000 | 65.000000 | 84.000000 | 99.000000 |
# Keep only the rows with strictly positive revenue. The summary of the raw
# data shows a minimum revenue of -0.66; note that the `> 0` condition also
# drops any zero-revenue rows, not just the negative ones.
# .copy() makes UNIQLO1 an independent DataFrame, so assigning new columns to
# it later does not trigger pandas' SettingWithCopyWarning.
UNIQLO1 = UNIQLO[UNIQLO['revenue']>0].copy()
# Re-check the summary statistics after filtering.
UNIQLO1.describe()
| | store_id | customer | revenue | order | quant | unit_cost |
|---|---|---|---|---|---|---|
| count | 22262.000000 | 22262.000000 | 22262.000000 | 22262.000000 | 22262.000000 | 22262.000000 |
| mean | 335.486614 | 1.630357 | 159.753549 | 1.652906 | 1.859222 | 46.127841 |
| std | 230.371454 | 1.786694 | 276.382135 | 1.862617 | 2.348723 | 19.120825 |
| min | 19.000000 | 1.000000 | 10.000000 | 1.000000 | 1.000000 | 9.000000 |
| 25% | 142.000000 | 1.000000 | 66.000000 | 1.000000 | 1.000000 | 49.000000 |
| 50% | 315.000000 | 1.000000 | 99.000000 | 1.000000 | 1.000000 | 49.000000 |
| 75% | 480.000000 | 2.000000 | 175.000000 | 2.000000 | 2.000000 | 49.000000 |
| max | 831.000000 | 58.000000 | 12538.000000 | 65.000000 | 84.000000 | 99.000000 |
问题一:整体销售情况随着时间的变化是怎样的?
题目拆解:
数据中与时间有关的字段只有类别变量 wkd_ind,其取值为 Weekday(周中)和 Weekend(周末),表示购买发生在周中还是周末。因此,本题实际上是对比周中与周末的销售相关指标,包括产品销售数量 quant、销售金额 revenue 和顾客人数 customer,并可用柱状图进行可视化。