概念
正态分布(Normal distribution),也称“常态分布”,又名高斯分布(Gaussian distribution),最早由棣莫弗(Abraham de Moivre)在求二项分布的渐近公式中得到。C.F.高斯在研究测量误差时从另一个角度导出了它。P.S.拉普拉斯和高斯研究了它的性质。正态分布是一个在数学、物理及工程等领域都非常重要的概率分布,在统计学的许多方面有着重大的影响力。
正态曲线呈钟形,两头低,中间高,左右对称。因其曲线呈钟形,人们又经常称之为钟形曲线。
若随机变量X服从一个数学期望为μ、方差为σ²的正态分布,则记为X~N(μ,σ²)。其概率密度函数为 $f(x)=\frac{1}{\sigma\sqrt{2\pi}}\,e^{-\frac{(x-\mu)^{2}}{2\sigma^{2}}}$。正态分布的期望值μ决定了其位置,其标准差σ决定了分布的幅度。当μ = 0,σ = 1时的正态分布是标准正态分布。
默认参数正态分布
# 加载功能包
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.style as style
from IPython.core.display import HTML
# 指定大小
%matplotlib inline
style.use('fivethirtyeight')
plt.rcParams['figure.figsize']=(14,7)
plt.figure(dpi=100)
# PDF 概率密度函数
plt.plot(np.linspace(-4,4,100),stats.norm.pdf(np.linspace(-4,4,100))/ np.max(stats.norm.pdf(np.linspace(-3,3,100))),)
# linspace选择取值;norm为正态分布
# plt.fill_between(np.linspace(-4,4,100),stats.norm.pdf(np.linspace(-4,4,100))/ np.max(stats.norm.pdf(np.linspace(-3,3,100))),alpha=45,)
# 进行图像填充
# CDF累计概率密度函数
plt.plot(np.linspace(-4,4,100),stats.norm.cdf(np.linspace(-4,4,100)),)
# 补充图设置:LEGEND、TICKS与TITLE
plt.text(x=-1.5, y=0.7, s="PDF(normed)", rotation=65, alpha = 75, weight="bold", color="#008fd5")
plt.text(x=0.5, y=0.5, s="CDF", rotation=55, alpha = 75, weight="bold", color="#fc4f30")
plt.tick_params(axis = 'both', which ='major', labelsize = 18)
plt.axhline(y = 0, color ='black', linewidth = 1.3, alpha = 7)
plt.text(x = -5, y = 1.25, s = "Normal Distribution - Overview", fontsize = 26, weight = 'bold', alpha = 75)
输出:
参数设置
均值
plt.figure(dpi=100)

# x grid shared by all curves.
x_grid = np.linspace(-4, 4, 100)
# Largest standard-normal PDF value on a 100-point grid over [-3, 3]; every
# curve is divided by it so the mu=0 curve peaks at approximately 1.
pdf_peak = np.max(stats.norm.pdf(np.linspace(-3, 3, 100)))

# One filled, rescaled PDF per mean (mu = 0, 2, -2). `loc` is the mean of the
# distribution, so changing it shifts the curve along the x-axis.
for mu in (0, 2, -2):
    curve = stats.norm.pdf(x_grid, loc=mu) / pdf_peak
    plt.plot(x_grid, curve)
    # alpha is an opacity and must lie in [0, 1].
    plt.fill_between(x_grid, curve, alpha=0.45)

# Curve labels, tick styling, baseline and title.
# Raw strings keep the backslash of the TeX commands (\mu) literal instead of
# relying on Python passing an invalid escape sequence through unchanged.
plt.text(x=-1.5, y=0.65, s=r"$ \mu=0 $", rotation=65, alpha=0.75, weight="bold", color="#008fd5")
plt.text(x=0.5, y=0.65, s=r"$ \mu=2 $", rotation=65, alpha=0.75, weight="bold", color="#fc4f30")
plt.text(x=-3.5, y=0.65, s=r"$ \mu=-2 $", rotation=65, alpha=0.75, weight="bold", color="#e5ae38")
plt.tick_params(axis='both', which='major', labelsize=18)
plt.axhline(y=0, color='black', linewidth=1.3, alpha=0.7)
plt.text(x=-5, y=1.25, s=r"Normal Distribution - $ \mu $", fontsize=26, weight='bold', alpha=0.75)
输出:
标准差
plt.figure(dpi=100)

# x grid shared by all curves.
x_grid = np.linspace(-4, 4, 100)
# Largest standard-normal PDF value on a 100-point grid over [-3, 3]; every
# curve is divided by it so the sigma=1 curve peaks at approximately 1.
pdf_peak = np.max(stats.norm.pdf(np.linspace(-3, 3, 100)))

# One filled, rescaled PDF per standard deviation (sigma = 1, 2, 0.5).
# `scale` is the standard deviation: larger values flatten and widen the
# curve, smaller values make it taller and narrower.
for sigma in (1, 2, 0.5):
    curve = stats.norm.pdf(x_grid, scale=sigma) / pdf_peak
    plt.plot(x_grid, curve)
    # alpha is an opacity and must lie in [0, 1].
    plt.fill_between(x_grid, curve, alpha=0.75)

# Curve labels, tick styling, baseline and title.
# Raw strings keep the backslash of the TeX commands (\sigma) literal instead
# of relying on Python passing an invalid escape sequence through unchanged.
plt.text(x=-1.5, y=0.65, s=r"$ \sigma=1 $", rotation=65, alpha=0.75, weight="bold", color="#008fd5")
plt.text(x=2.5, y=0.5, s=r"$ \sigma=2 $", rotation=65, alpha=0.75, weight="bold", color="#fc4f30")
plt.text(x=0.5, y=1.65, s=r"$ \sigma=0.5 $", rotation=65, alpha=0.75, weight="bold", color="#e5ae38")
plt.tick_params(axis='both', which='major', labelsize=18)
plt.axhline(y=0, color='black', linewidth=1.3, alpha=0.7)
plt.text(x=-5, y=2.15, s=r"Normal Distribution - $ \sigma $", fontsize=26, weight='bold', alpha=0.75)
随机的几个样本
可以使用norm.rvs()的默认参数,也可以自己指定参数
from scipy.stats import norm
# Draw random variates from a normal distribution and print each result
# followed by a blank line. The three draws, in order:
#   1. a single value from the standard normal (all defaults),
#   2. an array of 10 values from the standard normal,
#   3. a single value with the mean ('loc') set to 10 and the standard
#      deviation ('scale') set to 0.1.
for rvs_kwargs in ({}, {"size": 10}, {"loc": 10, "scale": 0.1}):
    print(norm.rvs(**rvs_kwargs), end="\n\n")
概率密度函数
根据概率密度函数来计算
from scipy.stats import norm
# additional imports for plotting purposes
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (14,7)
# relative likelihood of x and y
x=-1
y=2
print("pdf(x)={}\npdf(y)={}".format(norm.pdf(x),norm.pdf(y)))
# continuous pdf for the plot
x_s = np.linspace(-3,3,50)
y_s = norm.pdf(x_s)
plt.scatter(x_s, y_s);
累积分布函数(CDF)
from scipy.stats import norm
# Probability that a standard-normal X is less than or equal to 0.3:
# the CDF evaluated at 0.3.
p_below = norm.cdf(0.3)
print("P(X<0.3)={}".format(p_below))

# Probability that X falls in [-0.2, +0.2]: the difference between the CDF
# values at the two interval ends.
interval_low, interval_high = -0.2, 0.2
p_between = norm.cdf(interval_high) - norm.cdf(interval_low)
print("P(-0.2<X<0.2)={}".format(p_between))
基于数据画出分布
plt.figure(dpi=100)

# "True" parameters of the distribution the sample is drawn from.
mu_real = 10     # true mean
sigma_real = 2   # true standard deviation

# Draw a sample of n=1000; seeding NumPy's global RNG first makes the draw
# (and therefore the printed estimates) reproducible.
np.random.seed(42)
sample = stats.norm.rvs(loc=mu_real, scale=sigma_real, size=1000)

# Estimate mu and sigma from the sample (np.std uses ddof=0 by default).
mu_est = np.mean(sample)
sigma_est = np.std(sample)
print("Estimated MU: {}\nEstimated SIGMA: {}".format(mu_est, sigma_est))

# Histogram of the sample, normalised to unit area so it is comparable with
# the PDF curves. `density=True` replaces the `normed=True` keyword, which
# was removed from matplotlib.
plt.hist(sample, bins=50, density=True, alpha=.25)

# x grid shared by both PDF curves.
x_grid = np.linspace(2, 18, 1000)
# PDF with the true parameters.
plt.plot(x_grid, stats.norm.pdf(x_grid, loc=mu_real, scale=sigma_real))
# PDF with the parameters estimated from the sample.
plt.plot(x_grid, stats.norm.pdf(x_grid, loc=mu_est, scale=sigma_est))

# Labels standing in for a legend. alpha is an opacity and must lie in
# [0, 1]; matplotlib raises ValueError for values outside that range.
plt.text(x=9.5, y=0.1, s="sample", alpha=0.75, weight="bold", color="#008fd5")
plt.text(x=7, y=0.15, s="true distrubtion", rotation=65, alpha=0.75, weight="bold", color="#fc4f30")
plt.text(x=5, y=0.05, s="estimated distribution", rotation=65, alpha=0.75, weight="bold", color="#e5ae38")

# Tick styling and baseline.
plt.tick_params(axis='both', which='major', labelsize=18)
plt.axhline(y=0, color='black', linewidth=1.3, alpha=0.7)

# Title.
plt.text(x=0, y=0.3, s="Normal Distribution", fontsize=26, weight='bold', alpha=0.75)
Estimated MU: 10.038664111644652
Estimated SIGMA: 1.9574524154947084