# 使用正态分布确定极端GMV值所对应的日期
# 1、查看GMV分布直方图
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# Directory holding the input CSV; this absolute path is machine-specific.
pwd_path = "/Users/gao/Desktop/Code/"
# NOTE: the variable name says "4day" but the file it points at is the
# 32-day extract; the name is kept so later references keep working.
gmv_excp_4day = os.path.join(pwd_path, "gmv_excp_32days.csv")

# Load the GMV data; the code below reads its 'year' and 'gmv' columns.
gmv2 = pd.read_csv(gmv_excp_4day)
gmv2.head()  # preview of the first rows when run interactively

# Row labels of the records whose 'year' is 2020.
gmv2020_index = gmv2[gmv2['year'] == 2020].index

# Histogram (100 bins) of the 2020 GMV values. A single .loc[rows, column]
# call replaces the chained .loc[rows][column] lookup.
plt.hist(
    gmv2.loc[gmv2020_index, 'gmv'],
    100,
    alpha=0.5,
    label='2020-gmv_excp_32days',
)
plt.legend(loc='upper right')
plt.xlabel('gmv')
plt.ylabel('Frequency')
plt.title('Histogram of gmv')
plt.show()  # author's note: the distribution looks approximately normal
# 2、筛选出GMV高于均值2个标准差及以上的日期
# Work on the 2020 rows only.
ind = gmv2020_index
gmv2.loc[ind].describe()  # summary statistics, shown when run interactively

# Mean and standard deviation of the 2020 GMV values.
gmv_2020_values = gmv2.loc[ind]['gmv']
mu = gmv_2020_values.mean()
sigma = gmv_2020_values.std()

# Standardise each value and keep the row labels whose z-score is at least 2;
# only the upper tail is selected, unusually low values are not.
z_scores = (gmv_2020_values - mu) / sigma
bigthanXsigma_index = gmv_2020_values[z_scores >= 2].index

# Rows for the extreme-GMV dates.
gmv2.loc[bigthanXsigma_index]