import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
import warnings
warnings.filterwarnings("ignore")
df = pd.read_excel('data/2020年销售数据.xlsx')
df.head()
df.info()
1.统计月度销售额
# Tag every order with the calendar month of its sale date.
date = pd.to_datetime(df['销售日期'])
df['month'] = date.dt.month
df

# Total revenue per month, as a Series indexed by month.
df.groupby('month')['销售额'].sum()

# The same totals as a pivot table with one column per month.
# astype('int64') replaces the deprecated element-wise applymap(int); both
# truncate the float sums toward zero after missing cells are filled with 0.
pd.pivot_table(
    df,
    columns='month',
    values='销售额',
    aggfunc='sum'
).fillna(0).astype('int64')
2.统计品牌销售占比
temp = pd.pivot_table(
df,
index='品牌',
columns='销售区域',
values='销售额',
aggfunc='sum',
margins=True,
margins_name='总计'
).fillna(0).applymap(int)
temp
ser = df.groupby('品牌')['销售额'].sum()
ser.plot(
kind='pie',
autopct='%.1f%%',
pctdistance=0.8,
wedgeprops={
'edgecolor': 'white',
'width': 0.4
}
)
plt.plot?
df.groupby(['销售区域', 'month'])['销售额'].sum()
3.统计各地区的月度销售额
# Monthly revenue per sales region: months as rows, regions as columns.
# Missing month/region combinations become 0, and astype('int64') replaces
# the deprecated element-wise applymap(int) (same truncation toward zero).
pd.pivot_table(
    df,
    index='month',
    columns='销售区域',
    values='销售额',
    aggfunc='sum'
).fillna(0).astype('int64')
4.统计各渠道的品牌销量
# Units sold per channel (rows) and brand (columns), with '总计' totals.
# margins must be the boolean True; the previous string 'True' only worked
# because any non-empty string is truthy.
temp = pd.pivot_table(
    df,
    index='销售渠道',
    columns='品牌',
    values='销售数量',
    aggfunc='sum',
    margins=True,
    margins_name='总计'
)
temp

# Grouped bar chart for five explicitly named channels, one bar per brand.
temp.loc[['京东', '天猫', '实体', '抖音', '拼多多']]\
    .plot(
        kind='bar',
        y=['八匹马', '啊哟喂', '壁虎', '皮皮虾', '花花姑娘']
    )
plt.xticks(rotation=0)
plt.show()

# Number of cells in the pivot table (rows * columns, totals included).
temp.size

# Same chart, this time selecting rows by dropping the '总计' totals row
# rather than naming each channel.
temp.drop('总计').plot(
    kind='bar',
    y=['八匹马', '啊哟喂', '壁虎', '皮皮虾', '花花姑娘']
)
plt.xticks(rotation=0)
plt.show()

# Summary statistics of the unit price, used to pick the price bands below.
df['售价'].describe()
5.统计不同售价区间的月度销量占比
# Price bands (0, 300], (300, 600], ..., (1200, 1500].
bins = np.arange(0, 1501, 300)
box = pd.cut(df['售价'], bins)

# Units sold per price band (rows) and month (columns).
# aggfunc='sum' replaces the deprecated aggfunc=np.sum, and observed=False
# spells out the current handling of the categorical grouper instead of
# relying on a default that newer pandas versions change.
temp = df.pivot_table(
    index=box,
    columns='month',
    values='销售数量',
    aggfunc='sum',
    observed=False
)
temp

# Share of each price band within its month, as a percentage string.
# Each column is divided by its own total, scaled and rounded with vectorised
# calls, then formatted per column (replaces two deprecated applymap passes).
temp.div(temp.sum(axis=0), axis=1)\
    .mul(100)\
    .round(2)\
    .apply(lambda col: col.map('{}%'.format))
# Load the 2018 Beijing points-based settlement data, indexed by 'id'.
df = pd.read_csv('data/2018年北京积分落户数据.csv', index_col='id')
df.info()

# Parse the birthday column into datetimes and check the resulting dtypes.
df['birthday'] = pd.to_datetime(df['birthday'])
df
df.info()

from datetime import datetime
# Age in whole years as of 2018-07-01.
# NOTE(review): days // 365 ignores leap days, so the result can read one
# year high for people shortly before their birthday. Kept as-is because the
# bin edges used later were chosen against these values.
reference_day = datetime(2018, 7, 1)
elapsed = reference_day - df['birthday']
df['age'] = elapsed.dt.days // 365
df
df['age'].min(), df['age'].max()
# Five-year age bands, closed on the left: [30, 35), [35, 40), ..., [55, 60).
bins = np.arange(30, 61, 5)
box = pd.cut(df.age, bins, right=False)

# Head count per age band. observed=False keeps empty bands in the result
# regardless of the pandas default, and ['name'] avoids attribute-style
# column access on the GroupBy object.
ser = df.groupby(box, observed=False)['name'].count()
ser

ser.plot(kind='bar', color=['r', 'g', 'b'])
plt.xticks(rotation=0)
# Write each count just above its bar. Bars sit at x = 0, 1, 2, ..., so walk
# the values by position; ser[i] with an integer key on an interval-labelled
# index is ambiguous (label vs. position) and version-dependent.
for i, count in enumerate(ser):
    plt.text(i, count + 30, count, ha='center')
plt.show()

df.info()
df['score'].max(), df['score'].min()
按照分数段分箱
bins = np.arange(90, 125, 5)
box = pd.cut(df.score, bins, right=False)
ser = df.groupby(box).name.count()
ser
plt.text?
ser.plot(kind='bar', color=['r', 'g', 'b'])
plt.xticks(rotation=0)
for i in range(len(ser)):
plt.text(i, ser[i] + 30, ser[i], ha='center')
plt.show()
# Merge the two Huawei entities under one company name.
# Assign the result back to the column: an in-place replace on the Series
# returned by df.company is a chained mutation that does not reach df when
# pandas Copy-on-Write is active.
df['company'] = df['company'].replace('北京华为数字技术有限公司', '华为技术有限公司北京研究所')

# The 20 companies with the most records.
df['company'].value_counts().head(20)