import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from tabulate import tabulate
数理统计基础
数据样本分析从分析数据样本分布开始
数据分布分析
样本
Name | Salary |
---|---|
Dan | 50,000 |
Joann | 54,000 |
Pedro | 50,000 |
Rosie | 189,000 |
Ethan | 55,000 |
Vicky | 40,000 |
Frederic | 59,000 |
均值
$$\bar{x} = \frac{\displaystyle\sum_{i=1}^{n}x_{i}}{n}$$
$$\bar{x} = \frac{50000+54000+50000+189000+55000+40000+59000}{7} = 71000$$
# Sample salary table; the cell's value is the arithmetic mean of the column.
names = ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic']
salaries = [50000, 54000, 50000, 189000, 55000, 40000, 59000]
df = pd.DataFrame({'Name': names, 'Salary': salaries})
df.Salary.mean()
71000.0
中值
把数据进行排序
Salary |
---|
40,000 |
50,000 |
50,000 |
54,000 |
55,000 |
59,000 |
189,000 |
取中间的位置 $\frac{n+1}{2} = 4$
Salary |
---|
40,000 |
50,000 |
50,000 |
>54,000 |
55,000 |
59,000 |
189,000 |
如果是偶数个元素,则取中间两个位置上的值求平均:
$$\frac{n}{2} \;\;\;\; and \;\;\;\; \frac{n}{2} + 1$$
# Same salary sample; the cell's value is the median of the Salary column.
salary_table = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
}
df = pd.DataFrame(salary_table)
df.loc[:, 'Salary'].median()
54000.0
众数
如果选出现频率最高的数字时
Salary |
---|
40,000 |
>50,000 |
>50,000 |
54,000 |
55,000 |
59,000 |
189,000 |
应该是 50,000
# Same salary sample; mode() returns every most-frequent value as a Series.
df = pd.DataFrame(
    {
        'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
        'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    }
)
df.Salary.mode()
0 50000
dtype: int64
多个众数
很显然,众数可能有多个
Salary |
---|
40,000 |
>50,000 |
>50,000 |
54,000 |
>59,000 |
>59,000 |
189,000 |
# Variant of the salary sample in which 50000 and 59000 both occur twice,
# so mode() yields two values.
people = ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic']
pay = [50000, 54000, 50000, 189000, 59000, 40000, 59000]
df = pd.DataFrame({'Name': people, 'Salary': pay})
df['Salary'].mode()
0 50000
1 59000
dtype: int64
分布密度
找到中心点之后,希望获得更多关于分布形状的信息。
先从最大值和最小值开始:
# Print min / mode / median / mean / max of the salary sample as a small table.
df = pd.DataFrame({
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
})
s = df['Salary']
header_row = ['min', 'mode', 'media', 'mean', 'max']
# mode() returns a Series; [0] picks its first entry.
value_row = [s.min(), s.mode()[0], s.median(), s.mean(), s.max()]
print(tabulate([header_row, value_row]))
----- ----- ------- ------- ------
min mode media mean max
40000 50000 54000.0 71000.0 189000
----- ----- ------- ------- ------
可视化
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Salary':[50000,54000,50000,189000,55000,40000,59000]})
salary = df['Salary']
salary.plot.hist(title='Salary Distribution', color='lightblue', bins=25)
plt.axvline(salary.mean(), color='magenta', linestyle='dashed', linewidth=2)
plt.axvline(salary.median(), color='green', linestyle='dashed', linewidth=2)
plt.show()
均值 和 中值 分别用紫色(洋红)和绿色虚线标出:
- 直方图的高度是对应的样本数
- 可见,大部分样本在40000和55000之间
- 均值高于中值
直方图给出了分布密度的信息,在上图基础上绘制密度曲线:
高斯概率密度估计文档
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Salary':[50000,54000,50000,189000,55000,40000,59000]})
salary = df['Salary']
density = stats.gaussian_kde(salary)
n, x, _ = plt.hist(salary, histtype='step', density=True, bins=25)
plt.plot(x, density(x)*5)
plt.axvline(salary.mean(), color='magenta', linestyle='dashed', linewidth=2)
plt.axvline(salary.median(), color='green', linestyle='dashed', linewidth=2)
plt.show()
概率密度曲线的形状并不对称:
- 峰值在左边,右边有一个长尾
- 因为右侧的值把均值拉向右边,所以叫右偏态分布
再看一个左偏态分布的例子
Name | Hours |
---|---|
Dan | 41 |
Joann | 40 |
Pedro | 36 |
Rosie | 30 |
Ethan | 35 |
Vicky | 39 |
Frederic | 40 |
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Hours':[41,40,36,30,35,39,40]})
hours = df['Hours']
density = stats.gaussian_kde(hours)
n, x, _ = plt.hist(hours, histtype='step', normed=True, bins=25)
plt.plot(x, density(x)*7)
plt.axvline(hours.mean(), color='magenta', linestyle='dashed', linewidth=2)
plt.axvline(hours.median(), color='green', linestyle='dashed', linewidth=2)
plt.show()
- 下面是一个正态分布的例子
Name | Grade |
---|---|
Dan | 50 |
Joann | 50 |
Pedro | 46 |
Rosie | 95 |
Ethan | 50 |
Vicky | 5 |
Frederic | 57 |
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Grade':[50,50,46,95,50,5,57]})
grade = df['Grade']
density = stats.gaussian_kde(grade)
n, x, _ = plt.hist(grade, histtype='step', normed=True, bins=25)
plt.plot(x, density(x)*7.5)
plt.axvline(grade.mean(), color='magenta', linestyle='dashed', linewidth=2)
plt.axvline(grade.median(), color='green', linestyle='dashed', linewidth=2)
plt.show()
一个对称的钟形曲线:均值、中值和众数重合。
这就是最著名的正态分布(高斯分布)
偏度和峰度
通过测量这两个指标获取关于数据分布形状的认识:
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Salary':[50000,54000,50000,189000,55000,40000,59000],
'Hours':[41,40,36,30,35,39,40],
'Grade':[50,50,46,95,50,5,57]})
numcols = ['Salary', 'Hours', 'Grade']
for col in numcols:
print(df[col].name + ' skewness: ' + str(df[col].skew()))
print(df[col].name + ' kurtosis: ' + str(df[col].kurt()))
density = stats.gaussian_kde(df[col])
n, x, _ = plt.hist(df[col], histtype='step', density=True, bins=25)
plt.plot(x, density(x)*6)
plt.show()
print('\n')
Salary skewness: 2.57316410755049
Salary kurtosis: 6.719828837773431
Hours skewness: -1.194570307262883
Hours kurtosis: 0.9412265624999989
Grade skewness: -0.06512433009682762
Grade kurtosis: 2.7484764913773034
测量变化
范围
先用最大值减最小值
# Print the range (max - min) of every numeric column.
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
                   'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
                   'Hours': [41, 40, 36, 30, 35, 39, 40],
                   'Grade': [50, 50, 46, 95, 50, 5, 57]})

numcols = ['Salary', 'Hours', 'Grade']
# The loop body had lost its indentation, which made the cell fail to parse.
for col in numcols:
    print(f"{df[col].name} range: {df[col].max() - df[col].min()}")
Salary range: 149000
Hours range: 11
Grade range: 90
百分比 四分位数
百分比
例如
$$\frac{5}{7} \times 100 \approx 71.4$$
# Percentile rank of the grade 57, counting only strictly lower grades.
sample = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 30, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
}
df = pd.DataFrame(sample)
# Compare with the 'weak' variant used in the next cell.
print(stats.percentileofscore(df['Grade'], 57, kind='strict'))
71.42857142857143
根据是否包含被统计的样本本身,还有另外一种算法
$$\frac{6}{7} \times 100 \approx 85.7$$
# Percentile rank of the grade 57, counting grades lower than or equal to it.
sample = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 30, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
}
df = pd.DataFrame(sample)
print(stats.percentileofscore(df['Grade'], 57, kind='weak'))
85.71428571428571
如果要测量50分组的等级,50分有三个样本,把其中一个归类到高分组,把另一个归类到低分组。
$$\left(\frac{4}{7}\right) \times 100 \approx 57.14$$
这样可以算出50分组的等级
# Percentile rank of the tied grade 50 under the 'rank' and 'weak' rules.
sample = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 30, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
}
df = pd.DataFrame(sample)
# 'rank' gives the rank of a single member of the tied group.
print(stats.percentileofscore(df['Grade'], 50, kind='rank'))
print(stats.percentileofscore(df['Grade'], 50, kind='weak'))
57.142857142857146
# Tabulate the percentile rank of grade 50 under each of the three rules.
df = pd.DataFrame({
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 30, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
})
kinds = ['strict', 'weak', 'rank']
scores = [stats.percentileofscore(df['Grade'], 50, kind) for kind in kinds]
print(tabulate([kinds, scores]))
71.42857142857143
----------------- ----------------- ------------------
strict weak rank
28.57142857142857 71.42857142857143 57.142857142857146
----------------- ----------------- ------------------
四分位数
以25%,50%,75%为界,把所有样本划分到4个组里:
# Quartiles (25%, 50%, 75%) of the weekly-hours column.
sample = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 17, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
}
df = pd.DataFrame(sample)
quartile_points = [0.25, 0.5, 0.75]
df['Hours'].quantile(quartile_points)
0.25 35.5
0.50 39.0
0.75 40.0
Name: Hours, dtype: float64
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Salary':[50000,54000,50000,189000,55000,40000,59000],
'Hours':[41,40,36,30,35,39,40],
'Grade':[50,50,46,95,50,5,57]})
df['Hours'].plot(kind='box', title='Weekly Hours Distribution', figsize=(10,8))
plt.show()
箱线图
- 矩形(箱)代表 $\frac{1}{4} \to \frac{1}{2}$ 和 $\frac{1}{2} \to \frac{3}{4}$ 两部分
- 箱下、上的线分别代表 $0 \to \frac{1}{4}$ 和 $\frac{3}{4} \to 1$ 区间
- 中间的绿线代表中值
例外
例外往往格外引起注意
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Salary':[50000,54000,50000,189000,55000,40000,59000],
'Hours':[41,40,36,30,35,39,40],
'Grade':[50,50,46,95,50,5,57]})
df['Salary'].plot(kind='box', title='Salary Distribution', figsize=(10,8))
plt.show()
解释
例外是距离中值点太远的反常数据。例外总需要解释,
- 检查数据源有没有错
- 排除例外
- 接受例外,接受反常是存在的
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Salary':[50000,54000,50000,189000,55000,40000,59000],
'Hours':[41,40,36,17,35,39,40],
'Grade':[50,50,46,95,50,5,57]})
df['Salary'].plot(kind='box', title='Salary Distribution', figsize=(10,8), showfliers=False)
plt.show()
%matplotlib inline
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
'Salary':[50000,54000,50000,189000,55000,40000,59000],
'Hours':[41,40,36,17,35,39,40],
'Grade':[50,50,46,95,50,5,57]})
# 考试分数的巨大差别是可以解释的
df['Grade'].plot(kind='box', title='Grade Distribution', figsize=(10,8))
plt.show()
# Larger class of fifteen students.
students = ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic',
            'Jimmie', 'Rhonda', 'Giovanni', 'Francesca', 'Rajab', 'Naiyana',
            'Kian', 'Jenny']
grades = [50, 50, 46, 95, 50, 5, 57, 42, 26, 72, 78, 60, 40, 17, 85]
df = pd.DataFrame({'Name': students, 'Grade': grades})
# With more students, the "outliers" no longer stand out.
df['Grade'].plot.box(title='Grade Distribution', figsize=(10, 8))
plt.show()
方差和标准差
方差
方差测量偏离期望的程度
$$\sigma^{2} = \frac{\displaystyle\sum_{i=1}^{N} (X_{i} -\mu)^{2}}{N}$$
注意:上式是总体方差(除以 N);下面 pandas 的 `var()` 默认 `ddof=1`,计算的是样本方差(除以 n−1),所以结果是 685.62。
# Variance of the Grade column, using pandas' default settings.
sample = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 17, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
}
df = pd.DataFrame(sample)
df.Grade.var()
685.6190476190476
标准差
$$\sigma = \sqrt{\frac{\displaystyle\sum_{i=1}^{N} (X_{i} -\mu)^{2}}{N}}$$
(pandas 的 `std()` 同样默认 `ddof=1`,即除以 n−1。)
- 平方可以去掉负值
- 平方根可以统一量纲
# Standard deviation of the Grade column, using pandas' default settings.
sample = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 17, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
}
df = pd.DataFrame(sample)
df.Grade.std()
26.184328282754315
正态分布的标准差
正态分布是最重要的分布,它由期望和方差(标准差)这两个不可缺少的指标确定。
%matplotlib inline
Udf = pd.DataFrame(np.random.randn(100000, 1), columns=['Grade'])
grade = df['Grade']
density = stats.gaussian_kde(grade)
n, x, _ = plt.hist(grade, color='lightgrey', density=True, bins=100)
plt.plot(x, density(x))
s = df['Grade'].std()
m = df['Grade'].mean()
x1 = [m-s, m+s] # 1 𝛔
y1 = [0.25, 0.25]
plt.plot(x1,y1, color='magenta')
plt.annotate('1s (68.26%)', (x1[1],y1[1]))
x2 = [m-(s*2), m+(s*2)] # 2 𝛔
y2 = [0.05, 0.05]
plt.plot(x2,y2, color='green')
plt.annotate('2s (95.45%)', (x2[1],y2[1]))
x3 = [m-(s*3), m+(s*3)] # 3 𝛔
y3 = [0.005, 0.005]
plt.plot(x3,y3, color='orange')
plt.annotate('3s (99.73%)', (x3[1],y3[1]))
plt.axvline(grade.mean(), color='grey', linestyle='dashed', linewidth=1)
plt.show()
百分之六十八,九十五和九十九
看起来很难记住,其实很容易
归一化
$$Z = \frac{x - \mu}{\sigma}$$
比如
$$Z = \frac{95 - 50.43}{26.184} = 1.702$$
总结数据分布
函数名:describe
# Print the summary statistics of all numeric columns in one call.
sample = {
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000],
    'Hours': [41, 40, 36, 17, 35, 39, 40],
    'Grade': [50, 50, 46, 95, 50, 5, 57],
}
df = pd.DataFrame(sample)
summary = df.describe()
print(summary)
Salary Hours Grade
count 7.000000 7.000000 7.000000
mean 71000.000000 35.428571 50.428571
std 52370.475143 8.423324 26.184328
min 40000.000000 17.000000 5.000000
25% 50000.000000 35.500000 48.000000
50% 54000.000000 39.000000 50.000000
75% 57000.000000 40.000000 53.500000
max 189000.000000 41.000000 95.000000