第四章 多元正太总体的推断统计
课堂代码整理及注释
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mayavi.mlab import *
#### ---------------------------------------------
#### 单个总体_协方差阵未知_均值向量的检验
#### ---------------------------------------------
## 例 4.2.1
dat = pd.read_clipboard() ##可以直接复制剪切板数据
dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.2.1.xlsx")
x_bar = dat.mean()
# H0:
u0 = [90,58,16] #H0
n = dat.shape[0] # .shape[0] 读出有多少行 即有多少个样本 n
p = dat.shape[1] # .shape[1] 读出有多少列 即有多少个因素 p
n , p = dat.shape # 利用 tuple 赋值特点
S = dat.cov() # 样本协方差
# 构造检验统计量 T2 T^2 = n * (x_bar-u0)' * S^(-1) * (x_bar-u0)
T2 = n*(x_bar-u0).T.dot(np.linalg.inv(S)).dot(x_bar-u0) ; T2
F_value =(n-p)/((n-1)*p)*T2 ; F_value # 求出对应的 F_value
# 插曲 画出 F(p,n-p) 的概率密度曲线
rv_f = stats.f(p,n-p)
xs= np.linspace(0,10,101)
ys= rv_f.pdf(xs)
plt.plot(xs,ys)
alpha = 0.01 #显著性水平
F_crit = rv_f.ppf(1-alpha) ; F_crit ## 概率密度为1-alpha的时候对应的x的值
# 结果 F_value > F_ctri 落在拒绝域内
P_value = 1 - rv_f.cdf((n-p)/((n-1)*p)*T2) ; P_value ##计算概率 P 值可以得到同样的结果
#### ---------------------------------------------
#### 两总体_协方差相等_未知_均值向量的检验
#### ---------------------------------------------
## 例 4.3.1
dat= pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.3.1.xlsx")
X1 = dat.iloc[0:6,0:3] # 取出男婴样本
X2 = dat.iloc[6:,0:3] # 取出女婴样本
# X2.index = ['0','1','2','3','4','5','6','7','8']
n1 = X1.shape[0] # 行数即样本数
n2 = X2.shape[0]
p = X1.shape[1] # 列数即因素个数
X1_bar = X1.mean()
X2_bar = X2.mean()
Z_bar = X1_bar-X2_bar
S1 = X1.cov()
S2 = X2.cov()
Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2 # 协方差的联合无偏估计
T2 = n1*n2/(n1+n2)*Z_bar.T.dot(np.linalg.inv(Sp)).dot(Z_bar)
F_value = ((n1+n2-2)-p+1)/((n1+n2-2)*p) * T2
#服从F(3,11)分布 F(p,n1+n2-p-1)
alpha = 0.05
rv_f = stats.f(3,11)
F_ctri = rv_f.ppf(1-alpha)
#此时F_value < F_ctri,落在接收域内,接受H0,认为均值无差异
P_value = 1 - rv_f.cdf(F_value) ; P_value ##计算概率 P 值可以得到同样的结果
#### ---------------------------------------------
#### 两总体_协方差阵不等_样本容量不同_均值向量的检验
#### ---------------------------------------------
## 例 4.3.1
X1_n = X1
X2_n = X2.iloc[:6,]
X2_n.sum()
X2.sum()
Z = pd.DataFrame(X1_n.values - (n1/n2)**0.5 * X2_n.values,columns=['x1','x2','x3']) + (n1*n2)**(-0.5) * X2_n.sum() - (1/n2) * X2.sum()
# Z 的注释 检验统计量的构造见 PPT 第四章 P39
# X1_n.values : 将DataFrame转换为array
# Z 中包含 DataFrame 和 array 的运算过于复杂 需要结合运行结果分析
Z_bar = Z.mean()
Sz = Z.cov()
T2 = 6 * Z_bar.T.dot(np.linalg.inv(Sz).dot(Z_bar)) ; T2
F_value = 3/15 * T2 ;F_value
#Q3:F_value = (n1-p)*n1/p * T2
alpha = 0.05
rv_f = stats.f(3,3)
F_ctri = rv_f.ppf(1-alpha)
# F_value < F_ctri 落在接收域内
#### ---------------------------------------------
#### 多个总体_协方差矩阵相等性的检验
#### ---------------------------------------------
## BOX M 统计量
## 4.5.1
dat= pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.5.1.xlsx")
X1 = dat.iloc[0:20,0:4] # 取出销售方案 1
X2 = dat.iloc[20:40,0:4] # 取出销售方案 2
X3 = dat.iloc[40:,0:4] # 取出销售方案 3
S1 = X1.cov()
S2 = X2.cov()
S3 = X3.cov()
n1 = n2 = n3 = 20
n = n1 + n2 +n3
p = 4
k = 3
Sp = 1/57 * (19*S1 + 19*S2 +19*S3)
## BOX M 统计量
M = 57*np.log(np.linalg.det(Sp)) - 19*(np.log(np.linalg.det(S1)) + np.log(np.linalg.det(S2)) + np.log(np.linalg.det(S3)))
#### ---------------------------------------------
#### 插曲: 验证下 F 和 t 的值
#### ---------------------------------------------
stats.t.ppf(0.975,19)**2
stats.f.ppf(0.95,1,19)
#### ---------------------------------------------
#### 置信区域 和 联合置信区间
#### ---------------------------------------------
#### ---------------------------------------------
## 2 维
## 4.2.2
#### ---------------------------------------------
# 二元变量 p=2
# confidence region
def conf(n, x_bar, u, S):
return n*(x_bar-u).T.dot(np.linalg.inv(S)).dot(x_bar-u)
dat=pd.read_excel('F:\\基础数学课\\应用多元统计分析\\examp4.2.2.xlsx')
dat_x_bar = dat.mean()
dat_S = dat.cov()
dat_n = dat.shape[0]
dat_p = dat.shape[1]
# confidence region
def dat_conf(u):
return conf(dat_n , dat_x_bar , u , dat_S)
# S 是用离差阵还是cov? 样本方差阵
dat_conf([1,3]) ## 示例
#### ---------------------------------------------
#### 置信区域
#### ---------------------------------------------
U1,U2 = np.mgrid[40:110:101j,50:110:101j] ### 等价于下面的 np.linspace() + meshgrid()
u1s = np.linspace(40,110,101)
u2s = np.linspace(50,110,101)
U1 , U2 = np.meshgrid(u1s,u2s)
U = np.stack((np.ravel(U1),np.ravel(U2))).T ## help(np.stack)
Z = np.apply_along_axis(dat_conf,1,U).reshape(U1.shape) ## help(np.apply_along_axis)
##注解: numpy.apply_along_axis(func, axis, arr, *args, **kwargs)
# 2.作用:
# 将arr数组的每一个元素经过func函数变换形成的一个新数组
# 3.参数介绍:
# 其中func,axis,arr是必选的
# func是我们写的一个函数
# axis表示函数func对arr是作用于行还是列
# arr便是我们要进行操作的数组了
# 可选参数:*args, **kwargs。都是func()函数额外的参数
fig=plt.figure()
ax=fig.gca(projection="3d")
ax.plot_surface(U1,U1,Z)
plt.show()
plt.contour(U1,U2,Z)
#改变画布大小
plt.ylim(0,200)
plt.xlim(0,200)
rv_f = stats.f(dat_p,dat_n-dat_p)
F_crit = rv_f.ppf(0.90)
T2_crit = F_crit * (dat_p)*(dat_n-1)/(dat_n-dat_p)
ct1 = plt.contour(U1,U2,Z,[T2_crit]) ## help(plt.contour)
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
#### ---------------------------------------------
#### 联合置信区间
#### ---------------------------------------------
u1_min = dat_x_bar[0] - T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max = dat_x_bar[0] + T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u2_min = dat_x_bar[1] - T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max = dat_x_bar[1] + T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
#### ---------------------------------------------
#### Bonferroni 联合置信区间
#### ---------------------------------------------
rv_t = stats.t(dat_n-1)
alpha = 0.05
k = 2
tk_crit = rv_t.ppf(1-alpha/(2*k))
u1_min_b = dat_x_bar[0] - tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max_b = dat_x_bar[0] + tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u2_min_b = dat_x_bar[1] - tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max_b = dat_x_bar[1] + tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
#### ---------------------------------------------
#### 联合置信区间 和 Bonferroni 联合置信区间
#### ---------------------------------------------
ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
plt.hlines((u2_min_b,u2_max_b),u1_min_b,u1_max_b,'red')
plt.vlines((u1_min_b,u1_max_b),u2_min_b,u2_max_b,'red')
#### ---------------------------------------------
## 3 维
# 三元变量 p=3
#### ---------------------------------------------
def conf(n, x_bar, u, S):
return n*(x_bar-u).T.dot(np.linalg.inv(S)).dot(x_bar-u)
dat=pd.read_excel('F:\\基础数学课\\应用多元统计分析\\examp4.2.1.xlsx')
dat_x_bar = dat.mean()
dat_S = dat.cov()
dat_n = dat.shape[0]
dat_p = dat.shape[1]
def dat_conf(u):
return conf(dat_n , dat_x_bar , u , dat_S)
dat_conf([1,1,1])
U1,U2,U3 = np.mgrid[64:100:51j,54:66:51j,10:19:51j] ## 等价于下面的 np.linspace() + meshgrid()
u1s = np.linspace(40,110,101)
u2s = np.linspace(50,110,101)
U1 , U2 = np.meshgrid(u1s,u2s)
U = np.stack((np.ravel(U1),np.ravel(U2),np.ravel(U3))).T
Z = np.apply_along_axis(dat_conf,1,U).reshape(U1.shape)
rv_f = stats.f(dat_p,dat_n-dat_p)
F_crit = rv_f.ppf(0.95)
T2_crit = F_crit * (dat_p)*(dat_n-1)/(dat_n-dat_p)
contour3d(Z , contours=[T2_crit],transparent = True)
outline()
axes()
show()
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = Axes3D(fig)
ax.plot_surface(U1, U2, Z)
plt.show()
其中部分函数的参数说明整理
关于scipy.stats库的基础(关于概率分布)
1.scipy.stats库中各分布对应的方法
from scipy import stats
# 正态分布
stats.norm
# 卡方分布
stats.chi2
# t分布
stats.t
# F分布
stats.f
2.stats库中各分布的常用方法及其功能
- 对于正态分布:
stats.norm.cdf(α,均值,方差);
stats.norm.pdf(α,均值,方差);
stats.norm.isf(α,均值,方差);
- 对于t分布:
stats.t.cdf(α,自由度);
stats.t.pdf(α,自由度);
stats.t.isf(α,自由度);
- 对于F分布:
stats.f.cdf(α,自由度1,自由度2);
stats.f.pdf(α,自由度1,自由度2);
stats.f.isf(α,自由度1,自由度2);
一个简单的案例说明:
# 对于正态分布
stats.norm.cdf(0.5,2,3)
stats.norm.pdf(0.5,2,3)
stats.norm.isf(0.05,2,3)
# 对于t分布
stats.t.cdf(0.5,10)
stats.t.pdf(0.5,10)
stats.t.isf(0.0005,45)
以norm分布举例来计算偏度和峰度
import scipy.stats as st
st.norm.cdf(0)
st.norm(3,1).cdf(3)
st.norm.cdf(3,3,1)
st.norm.ppf(0.5,3,1)
>>> import scipy.stats as st
>>> a = [89, 23, 45, 18]
>>> st.skew(a) # 计算偏度
0.7565543738808015
>>> st.kurtosis(a) # 计算峰度
-1.0489580648783101
>>> import pandas as pd
>>> import numpy as np
>>> df = pd.DataFrame(np.array([[85, 68, 90], [82, 63, 88], [84, 90, 78]]), columns=['统计学', '高数', '英语'], index=['张三', '李四', '王五'])
>>> df
统计学 高数 英语
张三 85 68 90
李四 82 63 88
王五 84 90 78
>>> df.iloc[1, :].skew() # 计算第二行的偏度
-1.3294040702410526
>>> df.skew(axis = 0) # 计算所有列的偏度
统计学 -0.935220
高数 1.498959
英语 -1.545393
dtype: float64
>>> df.skew(axis = 1) # 计算所有行的偏度
张三 -1.373033
李四 -1.329404
王五 0.000000
dtype: float64
>>> df1 = pd.DataFrame(np.array([[85, 68, 90, 65], [82, 63, 88, 83], [84, 90, 78, 90], [72, 68, 91, 84]]), columns=['统计学', '高数', '英语', '计算机'], index=['张三', '李四', '王五', '马六'])
>>> df1
统计学 高数 英语 计算机
张三 85 68 90 65
李四 82 63 88 83
王五 84 90 78 90
马六 72 68 91 84
>>> df1.kurt(axis = 0) # 计算 df1 所有列的偏度
统计学 3.090874
高数 3.365664
英语 3.090874
计算机 2.769386
dtype: float64
>>> df1.iloc[:, 2].kurt() #计算 df1 第 3 列的偏度
3.090874188966101
3.概率密度函数及其图象
1. 正太分布
x = np.linspace(-5,5,100000)
y = stats.norm.pdf(x,0,1)
plt.plot(x,y,c="red")
plt.title('正态分布的概率密度函数')
plt.tight_layout()
plt.savefig("正态分布的概率密度函数",dpi=300)
2. 卡方分布
x = np.linspace(0,100,100000)
color = ["blue","green","darkgrey","darkblue","orange"]
for i in range(10,51,10):
y=stats.chi2.pdf(x,df=i)
plt.plot(x,y,c=color[int((i-10)/10)])
plt.title('卡方分布')
plt.tight_layout()
plt.savefig(" 布的概率密度函数",dpi=300)
3. t分布
x = np.linspace(-5,5,100000)
y = stats.t.pdf(x_t,2)
plt.plot(x,y,c="orange")
plt.title('t分布的概率密度函数')
plt.tight_layout()
plt.savefig("t分布的概率密度函数",dpi=300)
4. python绘制t分布和正态分布的概率密度函数对比图
x_norm = np.linspace(-5,5,100000)
y_norm = stats.norm.pdf(x_norm,0,1)
plt.plot(x_norm,y_norm,c="black")
color = ["green","darkblue","orange"]
x_t = np.linspace(-5,5,100000)
for i in range(1,4,1):
y_t = stats.t.pdf(x_t,i)
plt.plot(x_t,y_t,c=color[int(i-1)])
plt.title('t分布和正态分布的概率密度函数对比图')
plt.tight_layout()
plt.savefig("t分布和正态分布的概率密度函数对比图",dpi=300)
5. F分布
x = np.linspace(-1,8,100000)
y1 = stats.f.pdf(x,1,10)
y2 = stats.f.pdf(x,5,10)
y3 = stats.f.pdf(x,10,10)
plt.plot(x,y1)
plt.plot(x,y2)
plt.plot(x,y3)
plt.ylim(0,1)
plt.title('F分布的概率密度函数')
plt.tight_layout()
plt.savefig("F分布的概率密度函数",dpi=300)
np.stack()
系统中的解释 help(np.stack)
help(np.stack)
Help on function stack in module numpy:
stack(arrays, axis=0, out=None)
Join a sequence of arrays along a new axis.
The ``axis`` parameter specifies the index of the new axis in the
dimensions of the result. For example, if ``axis=0`` it will be the first
dimension and if ``axis=-1`` it will be the last dimension.
.. versionadded:: 1.10.0
Parameters
----------
arrays : sequence of array_like
Each array must have the same shape.
axis : int, optional
The axis in the result array along which the input arrays are stacked.
out : ndarray, optional
If provided, the destination to place the result. The shape must be
correct, matching that of what stack would have returned if no
out argument were specified.
Returns
-------
stacked : ndarray
The stacked array has one more dimension than the input arrays.
See Also
--------
concatenate : Join a sequence of arrays along an existing axis.
block : Assemble an nd-array from nested lists of blocks.
split : Split array into a list of multiple sub-arrays of equal size.
Examples
--------
>>> arrays = [np.random.randn(3, 4) for _ in range(10)]
>>> np.stack(arrays, axis=0).shape
(10, 3, 4)
>>> np.stack(arrays, axis=1).shape
(3, 10, 4)
>>> np.stack(arrays, axis=2).shape
(3, 4, 10)
>>> a = np.array([1, 2, 3])
>>> b = np.array([2, 3, 4])
>>> np.stack((a, b))
array([[1, 2, 3],
[2, 3, 4]])
>>> np.stack((a, b), axis=-1)
array([[1, 2],
[2, 3],
[3, 4]])
进一步举例 stack()、 hstack()、vstack()
import numpy as np
# stack()是按照不同轴的堆叠方式重新堆叠数组
a=[[1,2,3],[4,5,6]]
np.stack(a,axis=0)
# array([[1, 2, 3],
# [4, 5, 6]])
np.stack(a,axis=1)
# array([[1, 4],
# [2, 5],
# [3, 6]])
#可以看出axis=0是把原来的元素按照横轴的方式排列,axis=1是把原先元素按照纵轴排列
# 更多的例子
a=[[1,2,3,4],[5,6,7,8],[9,10,11,12]]
np.stack(a,axis=0)
#array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
np.stack(a,axis=1)
#array([[ 1, 5, 9],
# [ 2, 6, 10],
# [ 3, 7, 11],
# [ 4, 8, 12]])
a=[1,2,3,4]
b=[5,6,7,8]
c=[9,10,11,12]
np.stack((a,b,c),axis=0)
#array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
np.stack((a,b,c),axis=1)
#array([[ 1, 5, 9],
# [ 2, 6, 10],
# [ 3, 7, 11],
# [ 4, 8, 12]])
a=[[1,2,3],[4,5,6]]
b=[[7,8,9],[10,11,12]]
c=[[13,14,15],[16,17,18]]
np.stack((a,b,c),axis=0)
#array([[[ 1, 2, 3],
# [ 4, 5, 6]],
# [[ 7, 8, 9],
# [10, 11, 12]],
# [[13, 14, 15],
# [16, 17, 18]]])
np.stack((a,b,c),axis=1)
#array([[[ 1, 2, 3],
# [ 7, 8, 9],
# [13, 14, 15]],
# [[ 4, 5, 6],
# [10, 11, 12],
# [16, 17, 18]]])
#hstack()、vstack()是按元素进行堆叠而不是数组的形状堆叠,具体与stack的区别后面有个例子
a=[1,2,3]
b=[4,5,6]
np.hstack((a,b))
#array([1, 2, 3, 4, 5, 6])
np.vstack((a,b))
#array([[1, 2, 3],
# [4, 5, 6]])
#我们来看一下这三个函数对于复杂的矩阵堆叠的区别
a=[[1],[2],[3]]
b=[[4],[5],[6]]
c=[[7],[8],[9]]
np.stack((a,b,c),axis=0)
#array([[[1],
# [2],
# [3]],
# [[4],
# [5],
# [6]],
# [[7],
# [8],
# [9]]])
np.stack((a,b,c),axis=1)
#array([[[1],
# [4],
# [7]],
# [[2],
# [5],
# [8]],
# [[3],
# [6],
3 [9]]])
np.hstack((a,b,c))
#array([[1, 4, 7],
# [2, 5, 8],
# [3, 6, 9]])
np.vstack((a,b,c))
#array([[1],
# [2],
# [3],
# [4],
# [5],
# [6],
# [7],
# [8],
# [9]])
#再来看一个
a=[[1,2,3],[4,5,6]]
b=[[7,8,9],[10,11,12]]
c=[[13,14,15],[16,17,18]]
np.stack((a,b,c),axis=0)
#array([[[ 1, 2, 3],
# [ 4, 5, 6]],
# [[ 7, 8, 9],
# [10, 11, 12]],
# [[13, 14, 15],
# [16, 17, 18]]])
np.stack((a,b,c),axis=1)
#array([[[ 1, 2, 3],
# [ 7, 8, 9],
# [13, 14, 15]],
# [[ 4, 5, 6],
# [10, 11, 12],
# [16, 17, 18]]])
np.hstack((a,b,c))
#array([[ 1, 2, 3, 7, 8, 9, 13, 14, 15],
# [ 4, 5, 6, 10, 11, 12, 16, 17, 18]])
np.vstack((a,b,c))
#array([[ 1, 2, 3],
# [ 4, 5, 6],
# [ 7, 8, 9],
# [10, 11, 12],
# [13, 14, 15],
# [16, 17, 18]])
#可以看出stack是在不破坏原有矩阵形状的情况下按照横或纵的方式堆叠,hstack和vstack更进一步,打破了原有矩阵的结构
np.apply_along_axis()
系统中的解释 help(np.apply_along_axis)
help(np.apply_along_axis)
Help on function apply_along_axis in module numpy:
apply_along_axis(func1d, axis, arr, *args, **kwargs)
Apply a function to 1-D slices along the given axis.
Execute `func1d(a, *args, **kwargs)` where `func1d` operates on 1-D arrays
and `a` is a 1-D slice of `arr` along `axis`.
This is equivalent to (but faster than) the following use of `ndindex` and
`s_`, which sets each of ``ii``, ``jj``, and ``kk`` to a tuple of indices::
Ni, Nk = a.shape[:axis], a.shape[axis+1:]
for ii in ndindex(Ni):
for kk in ndindex(Nk):
f = func1d(arr[ii + s_[:,] + kk])
Nj = f.shape
for jj in ndindex(Nj):
out[ii + jj + kk] = f[jj]
Equivalently, eliminating the inner loop, this can be expressed as::
Ni, Nk = a.shape[:axis], a.shape[axis+1:]
for ii in ndindex(Ni):
for kk in ndindex(Nk):
out[ii + s_[...,] + kk] = func1d(arr[ii + s_[:,] + kk])
Parameters
----------
func1d : function (M,) -> (Nj...)
This function should accept 1-D arrays. It is applied to 1-D
slices of `arr` along the specified axis.
axis : integer
Axis along which `arr` is sliced.
arr : ndarray (Ni..., M, Nk...)
Input array.
args : any
Additional arguments to `func1d`.
kwargs : any
Additional named arguments to `func1d`.
.. versionadded:: 1.9.0
Returns
-------
out : ndarray (Ni..., Nj..., Nk...)
The output array. The shape of `out` is identical to the shape of
`arr`, except along the `axis` dimension. This axis is removed, and
replaced with new dimensions equal to the shape of the return value
of `func1d`. So if `func1d` returns a scalar `out` will have one
fewer dimensions than `arr`.
See Also
--------
apply_over_axes : Apply a function repeatedly over multiple axes.
Examples
--------
>>> def my_func(a):
... """Average first and last element of a 1-D array"""
... return (a[0] + a[-1]) * 0.5
>>> b = np.array([[1,2,3], [4,5,6], [7,8,9]])
>>> np.apply_along_axis(my_func, 0, b)
array([4., 5., 6.])
>>> np.apply_along_axis(my_func, 1, b)
array([2., 5., 8.])
For a function that returns a 1D array, the number of dimensions in
`outarr` is the same as `arr`.
>>> b = np.array([[8,1,7], [4,3,9], [5,2,6]])
>>> np.apply_along_axis(sorted, 1, b)
array([[1, 7, 8],
[3, 4, 9],
[2, 5, 6]])
For a function that returns a higher dimensional array, those dimensions
are inserted in place of the `axis` dimension.
>>> b = np.array([[1,2,3], [4,5,6], [7,8,9]])
>>> np.apply_along_axis(np.diag, -1, b)
array([[[1, 0, 0],
[0, 2, 0],
[0, 0, 3]],
[[4, 0, 0],
[0, 5, 0],
[0, 0, 6]],
[[7, 0, 0],
[0, 8, 0],
[0, 0, 9]]])
plt.contour()
import matplotlib.pyplot as plt
plt.contour([X, Y,] Z, [levels], ** kwargs)
plt.contour(x, y, z, 等高线条数,colors=颜色, linewidth=线宽)#等高线绘制
plt.contourf(x, y, z, 等高线条数,cmap=颜色映射)# 等高线填充
示例1
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(1, 10)
y = x.reshape(-1, 1)
h = x * y
cs = plt.contourf(h, levels=[10, 30, 50],
colors=['#808080', '#A0A0A0', '#C0C0C0'], extend='both')
cs.cmap.set_over('red')
cs.cmap.set_under('blue')
cs.changed()
示例2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 计算x,y坐标对应的高度值
def f(x, y):
return (1-x/2+x**5+y**3) * np.exp(-x**2-y**2)
# 生成x,y的数据
n = 256
x = np.linspace(-3, 3, n)
y = np.linspace(-3, 3, n)
# 把x,y数据生成mesh网格状的数据,因为等高线的显示是在网格的基础上添加上高度值
X, Y = np.meshgrid(x, y)
# 填充等高线
plt.contourf(X, Y, f(X, Y), 20, cmap=plt.cm.hot)
# 添加等高线
C = plt.contour(X, Y, f(X, Y), 20)
plt.clabel(C, inline=True, fontsize=12)
# 显示图表
plt.show()
示例3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 计算x,y坐标对应的高度值
def f(x,y):
#the height function
return(1-x/2+x**5+y**3)*np.exp(-x**2-y**2)
# 生成x,y的数据
x = np.linspace(-4, 3, 500)
y = np.linspace(-4, 3, 500)
X, Y = np.meshgrid(x, y)# 把x,y数据生成mesh网格状的数据,因为等高线的显示是在网格的基础上添加上高度值
c = plt.contour(X, Y, f(X, Y),colors = 'm',linewidth=.5)
plt.clabel(c,inline=False,fontsize=10)#等高线标签
plt.show()
plt.clabel()
clabel - 标记等高线,向CS(由contour函数返回的matplotlib.contour.ContourSet对象)中的轮廓线添加标签
clabel(CS, *args, **kwargs)
- CS - 由contour函数产生的句柄对象
- fontsize - string(smaller, x-large) or float ,optional
- colors - Color of each label
- None, 标记的颜色为轮廓的颜色
- one string color (e.g color = ‘r’ ), 所有的标签均为红色
- a tuple of matplotlib color args (string, float, rgb, etc), 不同的标签按照指定的颜色标记
- inline - bool, optional . 默认True(在标签位值移除轮廓线,也即标签覆盖轮廓线,而非穿越)
- inline_spacing - float, optional,默认5,放置内联时,标签两侧留有的像素空间
- fmt - string or dict, optional, 默认’%1.3f ',保留小数位。1.3中的1表示输出位宽,3表示小数位长度,此时实际数据会覆盖掉该数据对应的轮廓线;当9.3时,轮廓线会被覆盖掉9个位置,同时小数点后保留3位,也就是说,轮廓线移除的长度大于数据长度
- manual - bool or iterable, optional , 手动添加标签。忽略该字典
- rightside_up - bool, optional,默认 True(标签旋转均以正负90度计)
- use_clabeltext - bool, optional,默认False,若为True,则用
ClabelText
class (instead ofText
) 创建标签,当使用“CababelTress”绘制文本时,会重新计算文本的旋转角度,因此,如果轴的角度发生变化时,可以使用“CababelTress”来旋转角度。
plt.hlines() & plt.vlines()
hlines(y, xmin, xmax, colors='k', linestyles='solid', label='', *, data=None, **kwargs)
Plot horizontal lines at each *y* from *xmin* to *xmax*.
Parameters
----------
y : scalar or sequence of scalar
y-indexes where to plot the lines.
xmin, xmax : scalar or 1D array_like
Respective beginning and end of each line. If scalars are
provided, all lines will have same length.
colors : array_like of colors, optional, default: 'k'
linestyles : {'solid', 'dashed', 'dashdot', 'dotted'}, optional
label : string, optional, default: ''
示例
## 截取上面的代码
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
##plt.hlines((开始点1 , 开始点2) , 从哪开始画, 画到哪))
第五次作业
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#### P83 4.3.1
#### ---------------------------------------------
## 1. 协方差相等_未知_均值向量的检验
#### ---------------------------------------------
dat= pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.3.1.xlsx")
X1 = dat.iloc[0:6,0:3]
X2 = dat.iloc[6:,0:3]
n1 = X1.shape[0]
n2 = X2.shape[0]
p = X1.shape[1]
X1_bar = X1.mean()
X2_bar = X2.mean()
Z_bar = X1_bar-X2_bar
S1 = X1.cov()
S2 = X2.cov()
Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2
T2 = n1*n2/(n1+n2)*Z_bar.T.dot(np.linalg.inv(Sp)).dot(Z_bar)
F_value = ((n1+n2-2)-p+1)/((n1+n2-2)*p) * T2
#服从F(3,11)分布 F(p,n1+n2-p-1)
alpha = 0.05
rv_f = stats.f(3,11)
F_ctri = rv_f.ppf(1-alpha)
#此时F_value < F_ctri,落在接收域内,接受H0,认为均值无差异
P_value = 1 - rv_f.cdf(F_value) ; P_value
#### ---------------------------------------------
## 2. 协方差不相等_未知_均值向量的检验
#### ---------------------------------------------
X1_n = X1
X2_n = X2.iloc[:6,]
X2_n.sum()
X2.sum()
Z = pd.DataFrame(X1_n.values - (n1/n2)**0.5 * X2_n.values,columns=['x1','x2','x3']) + (n1*n2)**(-0.5) * X2_n.sum() - (1/n2) * X2.sum()
Z_bar = Z.mean()
Sz = Z.cov()
T2 = 6 * Z_bar.T.dot(np.linalg.inv(Sz).dot(Z_bar)) ; T2
F_value = 3/15 * T2 ;F_value
alpha = 0.05
rv_f = stats.f(3,3)
F_ctri = rv_f.ppf(1-alpha)
# F_value < F_ctri 落在接收域内
P_value = 1 - rv_f.cdf(F_value) ; P_value
#### ---------------------------------------------
## 3. 哪一个显著性更好
#### ---------------------------------------------
# 1 首先二者的样本的协方差相差并不多
S1 = X1.cov()
S2 = X2.cov()
# 2 协方差相等_未知_均值向量的检验 的 P 值更大
# 协方差相等_未知_均值向量的检验 P 值 :0.2692615756289385
# 协方差不相等_未知_均值向量的检验 P 值 :0.24551618103606898
# 故选择 协方差相等_未知_均值向量的检验 可能更好
#### P80 4.2.2
# confidence region
def conf(n, x_bar, u, S):
return n*(x_bar-u).T.dot(np.linalg.inv(S)).dot(x_bar-u)
def dat_conf(u):
return conf(dat_n , dat_x_bar , u , dat_S)
dat=pd.read_excel('F:\\基础数学课\\应用多元统计分析\\examp4.2.2_homework.xlsx')
dat_x_bar = dat.mean()
dat_S = dat.cov()
dat_n = dat.shape[0]
dat_p = dat.shape[1]
#### ---------------------------------------------
#### 置信区域
#### ---------------------------------------------
U1,U2 = np.mgrid[40:110:101j,50:110:101j]
U = np.stack((np.ravel(U1),np.ravel(U2))).T
Z = np.apply_along_axis(dat_conf,1,U).reshape(U1.shape)
fig=plt.figure()
ax=fig.gca(projection="3d")
ax.plot_surface(U1,U1,Z)
plt.show()
plt.contour(U1,U2,Z)
rv_f = stats.f(dat_p,dat_n-dat_p)
F_crit = rv_f.ppf(0.90)
T2_crit = F_crit * (dat_p)*(dat_n-1)/(dat_n-dat_p)
ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
#### ---------------------------------------------
#### 联合置信区间
#### ---------------------------------------------
u1_min = dat_x_bar[0] - T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max = dat_x_bar[0] + T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u2_min = dat_x_bar[1] - T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max = dat_x_bar[1] + T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(66,90)
plt.xlim(60,85)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
#### ---------------------------------------------
#### Bonferroni 联合置信区间
#### ---------------------------------------------
rv_t = stats.t(dat_n-1)
alpha = 0.1
k = 2
tk_crit = rv_t.ppf(1-alpha/(2*k))
u1_min_b = dat_x_bar[0] - tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max_b = dat_x_bar[0] + tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u2_min_b = dat_x_bar[1] - tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max_b = dat_x_bar[1] + tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
#### ---------------------------------------------
#### 联合置信区间 和 Bonferroni 联合置信区间
#### ---------------------------------------------
ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(66,90)
plt.xlim(60,85)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
plt.hlines((u2_min_b,u2_max_b),u1_min_b,u1_max_b,'red')
plt.vlines((u1_min_b,u1_max_b),u2_min_b,u2_max_b,'red')