应用多元统计分析（5）第四章多元正太总体的推断统计

最新推荐文章于 2024-04-27 20:58:54 发布

HarryStudyPython_ing

最新推荐文章于 2024-04-27 20:58:54 发布

阅读量1k

点赞数 1

分类专栏： Python 应用多元统计分析（教材：王学明）

本文链接：https://blog.csdn.net/qq_47805483/article/details/109343861

版权

Python 应用多元统计分析（教材：王学明）专栏收录该内容

9 篇文章 12 订阅

订阅专栏

第四章多元正太总体的推断统计

课堂代码整理及注释

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mayavi.mlab import *

#### ---------------------------------------------
#### 单个总体_协方差阵未知_均值向量的检验 
#### ---------------------------------------------

## 例 4.2.1

dat = pd.read_clipboard()   ##可以直接复制剪切板数据
dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.2.1.xlsx")

x_bar = dat.mean()

# H0:
u0 = [90,58,16]  #H0

n = dat.shape[0]    # .shape[0] 读出有多少行 即有多少个样本 n
p = dat.shape[1]    # .shape[1] 读出有多少列 即有多少个因素 p

n , p = dat.shape    # 利用 tuple 赋值特点

S = dat.cov()    # 样本协方差

# 构造检验统计量 T2    T^2 = n * (x_bar-u0)' * S^(-1) * (x_bar-u0)
T2 = n*(x_bar-u0).T.dot(np.linalg.inv(S)).dot(x_bar-u0) ; T2

F_value =(n-p)/((n-1)*p)*T2 ; F_value    # 求出对应的 F_value 

# 插曲 画出 F(p,n-p) 的概率密度曲线
rv_f = stats.f(p,n-p)
xs= np.linspace(0,10,101)
ys= rv_f.pdf(xs)
plt.plot(xs,ys)

alpha = 0.01    #显著性水平
F_crit = rv_f.ppf(1-alpha) ; F_crit     ## 概率密度为1-alpha的时候对应的x的值

# 结果 F_value > F_ctri 落在拒绝域内

P_value = 1 - rv_f.cdf((n-p)/((n-1)*p)*T2) ; P_value    ##计算概率 P 值可以得到同样的结果


#### --------------------------------------------- 
#### 两总体_协方差相等_未知_均值向量的检验
#### ---------------------------------------------

## 例 4.3.1

dat= pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.3.1.xlsx")

X1 = dat.iloc[0:6,0:3]    # 取出男婴样本
X2 = dat.iloc[6:,0:3]    # 取出女婴样本

# X2.index = ['0','1','2','3','4','5','6','7','8']

n1 = X1.shape[0]    # 行数即样本数
n2 = X2.shape[0]
p = X1.shape[1]    # 列数即因素个数

X1_bar = X1.mean()
X2_bar = X2.mean()

Z_bar = X1_bar-X2_bar

S1 = X1.cov()
S2 = X2.cov()

Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2    # 协方差的联合无偏估计

T2 = n1*n2/(n1+n2)*Z_bar.T.dot(np.linalg.inv(Sp)).dot(Z_bar)

F_value = ((n1+n2-2)-p+1)/((n1+n2-2)*p) * T2

#服从F（3,11）分布  F(p,n1+n2-p-1)

alpha = 0.05

rv_f = stats.f(3,11)
F_ctri = rv_f.ppf(1-alpha)

#此时F_value < F_ctri,落在接收域内，接受H0，认为均值无差异

P_value = 1 - rv_f.cdf(F_value) ; P_value    ##计算概率 P 值可以得到同样的结果


#### --------------------------------------------- 
#### 两总体_协方差阵不等_样本容量不同_均值向量的检验
#### --------------------------------------------- 

## 例 4.3.1

X1_n = X1
X2_n = X2.iloc[:6,]
X2_n.sum()
X2.sum()

Z = pd.DataFrame(X1_n.values - (n1/n2)**0.5 * X2_n.values,columns=['x1','x2','x3']) + (n1*n2)**(-0.5) * X2_n.sum() - (1/n2) * X2.sum()
# Z 的注释  检验统计量的构造见 PPT 第四章 P39
# X1_n.values : 将DataFrame转换为array
# Z 中包含 DataFrame 和 array 的运算过于复杂 需要结合运行结果分析

Z_bar = Z.mean()
Sz = Z.cov()

T2 = 6 * Z_bar.T.dot(np.linalg.inv(Sz).dot(Z_bar)) ; T2

F_value = 3/15 * T2 ;F_value 

#Q3:F_value = (n1-p)*n1/p * T2

alpha = 0.05

rv_f = stats.f(3,3)
F_ctri = rv_f.ppf(1-alpha)

# F_value < F_ctri  落在接收域内

#### --------------------------------------------- 
#### 多个总体_协方差矩阵相等性的检验
#### --------------------------------------------- 

## BOX M 统计量
## 4.5.1
dat= pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.5.1.xlsx")

X1 = dat.iloc[0:20,0:4]    # 取出销售方案 1
X2 = dat.iloc[20:40,0:4]    # 取出销售方案 2
X3 = dat.iloc[40:,0:4]    # 取出销售方案 3

S1 = X1.cov()
S2 = X2.cov()
S3 = X3.cov()

n1 = n2 = n3 = 20
n = n1 + n2 +n3
p = 4
k = 3

Sp = 1/57 * (19*S1 + 19*S2 +19*S3)

## BOX M 统计量
M = 57*np.log(np.linalg.det(Sp)) - 19*(np.log(np.linalg.det(S1)) + np.log(np.linalg.det(S2)) + np.log(np.linalg.det(S3)))


#### --------------------------------------------- 
#### 插曲： 验证下 F 和 t 的值
#### --------------------------------------------- 
stats.t.ppf(0.975,19)**2
stats.f.ppf(0.95,1,19)


#### --------------------------------------------- 
####  置信区域 和 联合置信区间
#### --------------------------------------------- 


#### --------------------------------------------- 
## 2 维
## 4.2.2
#### --------------------------------------------- 

# 二元变量  p=2

# confidence region
def conf(n, x_bar, u, S):
    return n*(x_bar-u).T.dot(np.linalg.inv(S)).dot(x_bar-u)


dat=pd.read_excel('F:\\基础数学课\\应用多元统计分析\\examp4.2.2.xlsx')

dat_x_bar = dat.mean()
dat_S = dat.cov()
dat_n = dat.shape[0]
dat_p = dat.shape[1]

# confidence region
def dat_conf(u):
    return conf(dat_n , dat_x_bar , u , dat_S)

# S 是用离差阵还是cov？ 样本方差阵

dat_conf([1,3])    ## 示例

#### --------------------------------------------- 
####   置信区域
#### --------------------------------------------- 

U1,U2 = np.mgrid[40:110:101j,50:110:101j]   ###  等价于下面的 np.linspace() + meshgrid()

u1s = np.linspace(40,110,101)
u2s = np.linspace(50,110,101)

U1 , U2 = np.meshgrid(u1s,u2s)

U = np.stack((np.ravel(U1),np.ravel(U2))).T    ## help(np.stack)

Z = np.apply_along_axis(dat_conf,1,U).reshape(U1.shape)    ## help(np.apply_along_axis)

##注解： numpy.apply_along_axis(func, axis, arr, *args, **kwargs)
# 2.作用：
# 将arr数组的每一个元素经过func函数变换形成的一个新数组

# 3.参数介绍:
# 其中func，axis，arr是必选的
# func是我们写的一个函数
# axis表示函数func对arr是作用于行还是列
# arr便是我们要进行操作的数组了
# 可选参数：*args, **kwargs。都是func()函数额外的参数

fig=plt.figure()
ax=fig.gca(projection="3d")
ax.plot_surface(U1,U1,Z)
plt.show()

plt.contour(U1,U2,Z)

#改变画布大小
plt.ylim(0,200)
plt.xlim(0,200)

rv_f = stats.f(dat_p,dat_n-dat_p)
F_crit = rv_f.ppf(0.90)
T2_crit = F_crit * (dat_p)*(dat_n-1)/(dat_n-dat_p)

ct1 = plt.contour(U1,U2,Z,[T2_crit])    ## help(plt.contour)
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")


#### --------------------------------------------- 
#### 联合置信区间
#### --------------------------------------------- 

u1_min = dat_x_bar[0] - T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max = dat_x_bar[0] + T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5

u2_min = dat_x_bar[1] - T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max = dat_x_bar[1] + T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5

ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)

#### --------------------------------------------- 
####   Bonferroni 联合置信区间
#### --------------------------------------------- 
rv_t = stats.t(dat_n-1)
alpha = 0.05
k = 2
tk_crit = rv_t.ppf(1-alpha/(2*k))

u1_min_b = dat_x_bar[0] - tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max_b = dat_x_bar[0] + tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5

u2_min_b = dat_x_bar[1] - tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max_b = dat_x_bar[1] + tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5

#### --------------------------------------------- 
#### 联合置信区间 和 Bonferroni 联合置信区间
#### --------------------------------------------- 
ct1 = plt.contour(U1,U2,Z,[T2_crit]) 
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
plt.hlines((u2_min_b,u2_max_b),u1_min_b,u1_max_b,'red')
plt.vlines((u1_min_b,u1_max_b),u2_min_b,u2_max_b,'red')


#### --------------------------------------------- 
## 3 维
# 三元变量  p=3
#### --------------------------------------------- 

def conf(n, x_bar, u, S):
    return n*(x_bar-u).T.dot(np.linalg.inv(S)).dot(x_bar-u)

dat=pd.read_excel('F:\\基础数学课\\应用多元统计分析\\examp4.2.1.xlsx')
dat_x_bar = dat.mean()
dat_S = dat.cov()
dat_n = dat.shape[0]
dat_p = dat.shape[1]

def dat_conf(u):
    return conf(dat_n , dat_x_bar , u , dat_S)

dat_conf([1,1,1])

U1,U2,U3 = np.mgrid[64:100:51j,54:66:51j,10:19:51j]  ## 等价于下面的 np.linspace() + meshgrid()

u1s = np.linspace(40,110,101)
u2s = np.linspace(50,110,101)
U1 , U2 = np.meshgrid(u1s,u2s)

U = np.stack((np.ravel(U1),np.ravel(U2),np.ravel(U3))).T

Z = np.apply_along_axis(dat_conf,1,U).reshape(U1.shape)

rv_f = stats.f(dat_p,dat_n-dat_p)
F_crit = rv_f.ppf(0.95)
T2_crit = F_crit * (dat_p)*(dat_n-1)/(dat_n-dat_p)

contour3d(Z , contours=[T2_crit],transparent = True)
outline()
axes()
show()

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = Axes3D(fig)
ax.plot_surface(U1, U2, Z)
plt.show()

其中部分函数的参数说明整理

关于scipy.stats库的基础（关于概率分布）

1.scipy.stats库中各分布对应的方法

from scipy import stats
# 正态分布
stats.norm
# 卡方分布
stats.chi2
# t分布
stats.t
# F分布
stats.f

2.stats库中各分布的常用方法及其功能

在这里插入图片描述

对于正态分布：

stats.norm.cdf(α,均值,方差)；

stats.norm.pdf(α,均值,方差)；

stats.norm.isf(α,均值,方差)；

对于t分布：

stats.t.cdf(α,自由度)；

stats.t.pdf(α,自由度)；

stats.t.isf(α,自由度)；

对于F分布：

stats.f.cdf(α,自由度1,自由度2)；

stats.f.pdf(α,自由度1,自由度2)；

stats.f.isf(α,自由度1,自由度2)；

一个简单的案例说明：

# 对于正态分布
stats.norm.cdf(0.5,2,3)
stats.norm.pdf(0.5,2,3)
stats.norm.isf(0.05,2,3)
# 对于t分布
stats.t.cdf(0.5,10)
stats.t.pdf(0.5,10)
stats.t.isf(0.0005,45)

在这里插入图片描述

以norm分布举例来计算偏度和峰度

import scipy.stats as st

st.norm.cdf(0)

st.norm(3,1).cdf(3)
st.norm.cdf(3,3,1)
st.norm.ppf(0.5,3,1)

>>> import scipy.stats as st
>>> a = [89, 23, 45, 18]

>>> st.skew(a) # 计算偏度
0.7565543738808015

>>> st.kurtosis(a) # 计算峰度
-1.0489580648783101

>>> import pandas as pd
>>> import numpy as np
>>> df = pd.DataFrame(np.array([[85, 68, 90], [82, 63, 88], [84, 90, 78]]), columns=['统计学', '高数', '英语'], index=['张三', '李四', '王五'])
>>> df
    统计学  高数  英语
张三   85  68  90
李四   82  63  88
王五   84  90  78

>>> df.iloc[1, :].skew() # 计算第二行的偏度
-1.3294040702410526

>>> df.skew(axis = 0) # 计算所有列的偏度
统计学   -0.935220
高数     1.498959
英语    -1.545393
dtype: float64

>>> df.skew(axis = 1) # 计算所有行的偏度
张三   -1.373033
李四   -1.329404
王五    0.000000
dtype: float64

>>> df1 = pd.DataFrame(np.array([[85, 68, 90, 65], [82, 63, 88, 83], [84, 90, 78, 90], [72, 68, 91, 84]]), columns=['统计学', '高数', '英语', '计算机'], index=['张三', '李四', '王五', '马六'])
>>> df1
    统计学  高数  英语  计算机
张三   85  68  90   65
李四   82  63  88   83
王五   84  90  78   90
马六   72  68  91   84

>>> df1.kurt(axis = 0) # 计算 df1 所有列的偏度
统计学    3.090874
高数     3.365664
英语     3.090874
计算机    2.769386
dtype: float64

>>> df1.iloc[:, 2].kurt() #计算 df1 第 3 列的偏度
3.090874188966101

3.概率密度函数及其图象

1. 正太分布

x = np.linspace(-5,5,100000)
y = stats.norm.pdf(x,0,1)
plt.plot(x,y,c="red")
plt.title('正态分布的概率密度函数')
 
plt.tight_layout()
plt.savefig("正态分布的概率密度函数",dpi=300)

2. 卡方分布

x = np.linspace(0,100,100000)
color = ["blue","green","darkgrey","darkblue","orange"]
for i in range(10,51,10):
    y=stats.chi2.pdf(x,df=i)
    plt.plot(x,y,c=color[int((i-10)/10)])
plt.title('卡方分布')
 
plt.tight_layout()
plt.savefig(" 布的概率密度函数",dpi=300)

3. t分布

x = np.linspace(-5,5,100000)
y = stats.t.pdf(x_t,2)
plt.plot(x,y,c="orange")
plt.title('t分布的概率密度函数')
 
plt.tight_layout()
plt.savefig("t分布的概率密度函数",dpi=300)

4. python绘制t分布和正态分布的概率密度函数对比图

x_norm = np.linspace(-5,5,100000)
y_norm = stats.norm.pdf(x_norm,0,1)
plt.plot(x_norm,y_norm,c="black")
 
color = ["green","darkblue","orange"]
 
x_t = np.linspace(-5,5,100000)
for i in range(1,4,1):
    y_t = stats.t.pdf(x_t,i)
    plt.plot(x_t,y_t,c=color[int(i-1)])
    
plt.title('t分布和正态分布的概率密度函数对比图')
 
plt.tight_layout()
plt.savefig("t分布和正态分布的概率密度函数对比图",dpi=300)

5. F分布

x = np.linspace(-1,8,100000)
y1 = stats.f.pdf(x,1,10)
y2 = stats.f.pdf(x,5,10)
y3 = stats.f.pdf(x,10,10)
plt.plot(x,y1)
plt.plot(x,y2)
plt.plot(x,y3)
plt.ylim(0,1)
plt.title('F分布的概率密度函数')
 
plt.tight_layout()
plt.savefig("F分布的概率密度函数",dpi=300)

np.stack（）

系统中的解释 help(np.stack)

help(np.stack)
Help on function stack in module numpy:

stack(arrays, axis=0, out=None)
    Join a sequence of arrays along a new axis.
    
    The ``axis`` parameter specifies the index of the new axis in the
    dimensions of the result. For example, if ``axis=0`` it will be the first
    dimension and if ``axis=-1`` it will be the last dimension.
    
    .. versionadded:: 1.10.0
    
    Parameters
    ----------
    arrays : sequence of array_like
        Each array must have the same shape.
    
    axis : int, optional
        The axis in the result array along which the input arrays are stacked.
    
    out : ndarray, optional
        If provided, the destination to place the result. The shape must be
        correct, matching that of what stack would have returned if no
        out argument were specified.
    
    Returns
    -------
    stacked : ndarray
        The stacked array has one more dimension than the input arrays.
    
    See Also
    --------
    concatenate : Join a sequence of arrays along an existing axis.
    block : Assemble an nd-array from nested lists of blocks.
    split : Split array into a list of multiple sub-arrays of equal size.
    
    Examples
    --------
    >>> arrays = [np.random.randn(3, 4) for _ in range(10)]
    >>> np.stack(arrays, axis=0).shape
    (10, 3, 4)
    
    >>> np.stack(arrays, axis=1).shape
    (3, 10, 4)
    
    >>> np.stack(arrays, axis=2).shape
    (3, 4, 10)
    
    >>> a = np.array([1, 2, 3])
    >>> b = np.array([2, 3, 4])
    >>> np.stack((a, b))
    array([[1, 2, 3],
           [2, 3, 4]])
    
    >>> np.stack((a, b), axis=-1)
    array([[1, 2],
           [2, 3],
           [3, 4]])

进一步举例 stack()、 hstack()、vstack()

import numpy as np
 
# stack()是按照不同轴的堆叠方式重新堆叠数组
 
a=[[1,2,3],[4,5,6]]
 
np.stack(a,axis=0)
# array([[1, 2, 3],
#       [4, 5, 6]])
 
np.stack(a,axis=1)
# array([[1, 4],
#       [2, 5],
#       [3, 6]])
 
#可以看出axis=0是把原来的元素按照横轴的方式排列，axis=1是把原先元素按照纵轴排列
 
# 更多的例子
 
a=[[1,2,3,4],[5,6,7,8],[9,10,11,12]]
 
np.stack(a,axis=0)
#array([[ 1,  2,  3,  4],
#       [ 5,  6,  7,  8],
#       [ 9, 10, 11, 12]])
 
np.stack(a,axis=1)
#array([[ 1,  5,  9],
#       [ 2,  6, 10],
#       [ 3,  7, 11],
#       [ 4,  8, 12]])
 
a=[1,2,3,4]
 
b=[5,6,7,8]
 
c=[9,10,11,12]
 
np.stack((a,b,c),axis=0)
#array([[ 1,  2,  3,  4],
#       [ 5,  6,  7,  8],
#       [ 9, 10, 11, 12]])
 
np.stack((a,b,c),axis=1)
#array([[ 1,  5,  9],
#       [ 2,  6, 10],
#       [ 3,  7, 11],
#       [ 4,  8, 12]])
 
a=[[1,2,3],[4,5,6]]
 
b=[[7,8,9],[10,11,12]]
 
c=[[13,14,15],[16,17,18]]
 
np.stack((a,b,c),axis=0)
#array([[[ 1,  2,  3],
#        [ 4,  5,  6]],
 
#       [[ 7,  8,  9],
#        [10, 11, 12]],
 
#       [[13, 14, 15],
#        [16, 17, 18]]])
 
np.stack((a,b,c),axis=1)
#array([[[ 1,  2,  3],
#        [ 7,  8,  9],
#        [13, 14, 15]],
 
#       [[ 4,  5,  6],
#        [10, 11, 12],
#        [16, 17, 18]]])
 
 
#hstack()、vstack()是按元素进行堆叠而不是数组的形状堆叠，具体与stack的区别后面有个例子
 
a=[1,2,3]
 
b=[4,5,6]
 
np.hstack((a,b))
#array([1, 2, 3, 4, 5, 6])
 
np.vstack((a,b))
#array([[1, 2, 3],
#       [4, 5, 6]])
 
#我们来看一下这三个函数对于复杂的矩阵堆叠的区别
 
a=[[1],[2],[3]]
 
b=[[4],[5],[6]]
 
c=[[7],[8],[9]]
 
 
np.stack((a,b,c),axis=0)
#array([[[1],
#        [2],
#        [3]],
 
#       [[4],
#        [5],
#        [6]],
 
#       [[7],
#        [8],
#        [9]]])
 
 
np.stack((a,b,c),axis=1)
#array([[[1],
#        [4],
#        [7]],
 
#       [[2],
#        [5],
#        [8]],
 
#       [[3],
#        [6],
3        [9]]])
 
np.hstack((a,b,c))
#array([[1, 4, 7],
#       [2, 5, 8],
#       [3, 6, 9]])
 
np.vstack((a,b,c))
#array([[1],
#       [2],
#       [3],
#       [4],
#       [5],
#       [6],
#       [7],
#      [8],
#       [9]])
 
#再来看一个
a=[[1,2,3],[4,5,6]]
 
b=[[7,8,9],[10,11,12]]
 
c=[[13,14,15],[16,17,18]]
 
np.stack((a,b,c),axis=0)
#array([[[ 1,  2,  3],
#        [ 4,  5,  6]],
 
#       [[ 7,  8,  9],
#        [10, 11, 12]],
 
#       [[13, 14, 15],
#        [16, 17, 18]]])
 
np.stack((a,b,c),axis=1)
#array([[[ 1,  2,  3],
#        [ 7,  8,  9],
#        [13, 14, 15]],
 
#       [[ 4,  5,  6],
#        [10, 11, 12],
#        [16, 17, 18]]])
 
np.hstack((a,b,c))
#array([[ 1,  2,  3,  7,  8,  9, 13, 14, 15],
#       [ 4,  5,  6, 10, 11, 12, 16, 17, 18]])
 
np.vstack((a,b,c))
#array([[ 1,  2,  3],
#       [ 4,  5,  6],
#       [ 7,  8,  9],
#       [10, 11, 12],
#       [13, 14, 15],
#       [16, 17, 18]])
 
#可以看出stack是在不破坏原有矩阵形状的情况下按照横或纵的方式堆叠，hstack和vstack更进一步，打破了原有矩阵的结构

np.apply_along_axis（）

系统中的解释 help(np.apply_along_axis)

help(np.apply_along_axis)
Help on function apply_along_axis in module numpy:

apply_along_axis(func1d, axis, arr, *args, **kwargs)
    Apply a function to 1-D slices along the given axis.
    
    Execute `func1d(a, *args, **kwargs)` where `func1d` operates on 1-D arrays
    and `a` is a 1-D slice of `arr` along `axis`.
    
    This is equivalent to (but faster than) the following use of `ndindex` and
    `s_`, which sets each of ``ii``, ``jj``, and ``kk`` to a tuple of indices::
    
        Ni, Nk = a.shape[:axis], a.shape[axis+1:]
        for ii in ndindex(Ni):
            for kk in ndindex(Nk):
                f = func1d(arr[ii + s_[:,] + kk])
                Nj = f.shape
                for jj in ndindex(Nj):
                    out[ii + jj + kk] = f[jj]
    
    Equivalently, eliminating the inner loop, this can be expressed as::
    
        Ni, Nk = a.shape[:axis], a.shape[axis+1:]
        for ii in ndindex(Ni):
            for kk in ndindex(Nk):
                out[ii + s_[...,] + kk] = func1d(arr[ii + s_[:,] + kk])
    
    Parameters
    ----------
    func1d : function (M,) -> (Nj...)
        This function should accept 1-D arrays. It is applied to 1-D
        slices of `arr` along the specified axis.
    axis : integer
        Axis along which `arr` is sliced.
    arr : ndarray (Ni..., M, Nk...)
        Input array.
    args : any
        Additional arguments to `func1d`.
    kwargs : any
        Additional named arguments to `func1d`.
    
        .. versionadded:: 1.9.0
    
    
    Returns
    -------
    out : ndarray  (Ni..., Nj..., Nk...)
        The output array. The shape of `out` is identical to the shape of
        `arr`, except along the `axis` dimension. This axis is removed, and
        replaced with new dimensions equal to the shape of the return value
        of `func1d`. So if `func1d` returns a scalar `out` will have one
        fewer dimensions than `arr`.
    
    See Also
    --------
    apply_over_axes : Apply a function repeatedly over multiple axes.
    
    Examples
    --------
    >>> def my_func(a):
    ...     """Average first and last element of a 1-D array"""
    ...     return (a[0] + a[-1]) * 0.5
    >>> b = np.array([[1,2,3], [4,5,6], [7,8,9]])
    >>> np.apply_along_axis(my_func, 0, b)
    array([4., 5., 6.])
    >>> np.apply_along_axis(my_func, 1, b)
    array([2.,  5.,  8.])
    
    For a function that returns a 1D array, the number of dimensions in
    `outarr` is the same as `arr`.
    
    >>> b = np.array([[8,1,7], [4,3,9], [5,2,6]])
    >>> np.apply_along_axis(sorted, 1, b)
    array([[1, 7, 8],
           [3, 4, 9],
           [2, 5, 6]])
    
    For a function that returns a higher dimensional array, those dimensions
    are inserted in place of the `axis` dimension.
    
    >>> b = np.array([[1,2,3], [4,5,6], [7,8,9]])
    >>> np.apply_along_axis(np.diag, -1, b)
    array([[[1, 0, 0],
            [0, 2, 0],
            [0, 0, 3]],
           [[4, 0, 0],
            [0, 5, 0],
            [0, 0, 6]],
           [[7, 0, 0],
            [0, 8, 0],
            [0, 0, 9]]])

plt.contour（）

import matplotlib.pyplot as plt 
plt.contour([X, Y,] Z, [levels], ** kwargs)

plt.contour(x, y, z, 等高线条数，colors=颜色, linewidth=线宽)#等高线绘制
plt.contourf(x, y, z, 等高线条数，cmap=颜色映射)# 等高线填充

示例1

import numpy as np
import matplotlib.pyplot as plt
x = np.arange(1, 10)
y = x.reshape(-1, 1)
h = x * y

cs = plt.contourf(h, levels=[10, 30, 50],
    colors=['#808080', '#A0A0A0', '#C0C0C0'], extend='both')
cs.cmap.set_over('red')
cs.cmap.set_under('blue')
cs.changed()

在这里插入图片描述

示例2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
# 计算x,y坐标对应的高度值
def f(x, y):
 return (1-x/2+x**5+y**3) * np.exp(-x**2-y**2)
 
# 生成x,y的数据
n = 256
x = np.linspace(-3, 3, n)
y = np.linspace(-3, 3, n)
 
# 把x,y数据生成mesh网格状的数据，因为等高线的显示是在网格的基础上添加上高度值
X, Y = np.meshgrid(x, y)
 
# 填充等高线
plt.contourf(X, Y, f(X, Y), 20, cmap=plt.cm.hot)
# 添加等高线
C = plt.contour(X, Y, f(X, Y), 20)
plt.clabel(C, inline=True, fontsize=12)
# 显示图表
plt.show()

在这里插入图片描述

示例3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 计算x,y坐标对应的高度值
def f(x,y):
#the height function
    return(1-x/2+x**5+y**3)*np.exp(-x**2-y**2)
# 生成x,y的数据
x = np.linspace(-4, 3, 500)
y = np.linspace(-4, 3, 500)
X, Y = np.meshgrid(x, y)# 把x,y数据生成mesh网格状的数据，因为等高线的显示是在网格的基础上添加上高度值
c = plt.contour(X, Y, f(X, Y),colors = 'm',linewidth=.5)
plt.clabel(c,inline=False,fontsize=10)#等高线标签
plt.show()

在这里插入图片描述

plt.clabel（）

clabel - 标记等高线，向CS(由contour函数返回的matplotlib.contour.ContourSet对象)中的轮廓线添加标签

clabel(CS, *args, **kwargs)

CS - 由contour函数产生的句柄对象
fontsize - string(smaller, x-large) or float ,optional
colors - Color of each label
- None, 标记的颜色为轮廓的颜色
- one string color (e.g color = ‘r’ ), 所有的标签均为红色
- a tuple of matplotlib color args (string, float, rgb, etc), 不同的标签按照指定的颜色标记
inline - bool, optional . 默认True（在标签位值移除轮廓线，也即标签覆盖轮廓线，而非穿越）
inline_spacing - float, optional，默认5，放置内联时，标签两侧留有的像素空间
fmt - string or dict, optional，默认’％1.3f '，保留小数位。1.3中的1表示输出位宽，3表示小数位长度，此时实际数据会覆盖掉该数据对应的轮廓线；当9.3时，轮廓线会被覆盖掉9个位置，同时小数点后保留3位，也就是说，轮廓线移除的长度大于数据长度
manual - bool or iterable, optional , 手动添加标签。忽略该字典
rightside_up - bool, optional，默认 True（标签旋转均以正负90度计）
use_clabeltext - bool, optional，默认False，若为True，则用 ClabelText class (instead of Text) 创建标签，当使用“CababelTress”绘制文本时，会重新计算文本的旋转角度，因此，如果轴的角度发生变化时，可以使用“CababelTress”来旋转角度。

plt.hlines（） & plt.vlines（）

hlines(y, xmin, xmax, colors='k', linestyles='solid', label='', *, data=None, **kwargs)
    Plot horizontal lines at each *y* from *xmin* to *xmax*.
    
    Parameters
    ----------
    y : scalar or sequence of scalar
        y-indexes where to plot the lines.
    
    xmin, xmax : scalar or 1D array_like
        Respective beginning and end of each line. If scalars are
        provided, all lines will have same length.
    
    colors : array_like of colors, optional, default: 'k'
    
    linestyles : {'solid', 'dashed', 'dashdot', 'dotted'}, optional
    
    label : string, optional, default: ''

示例

## 截取上面的代码
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
##plt.hlines((开始点1 , 开始点2) , 从哪开始画, 画到哪))

第五次作业

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

#### P83 4.3.1

#### ---------------------------------------------
##      1.    协方差相等_未知_均值向量的检验
#### ---------------------------------------------

dat= pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp4.3.1.xlsx")

X1 = dat.iloc[0:6,0:3]    
X2 = dat.iloc[6:,0:3]  

n1 = X1.shape[0]  
n2 = X2.shape[0]
p = X1.shape[1]  

X1_bar = X1.mean()
X2_bar = X2.mean()

Z_bar = X1_bar-X2_bar

S1 = X1.cov()
S2 = X2.cov()

Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2   

T2 = n1*n2/(n1+n2)*Z_bar.T.dot(np.linalg.inv(Sp)).dot(Z_bar)

F_value = ((n1+n2-2)-p+1)/((n1+n2-2)*p) * T2

#服从F（3,11）分布  F(p,n1+n2-p-1)

alpha = 0.05

rv_f = stats.f(3,11)
F_ctri = rv_f.ppf(1-alpha)

#此时F_value < F_ctri,落在接收域内，接受H0，认为均值无差异

P_value = 1 - rv_f.cdf(F_value) ; P_value    



#### ---------------------------------------------
##      2.    协方差不相等_未知_均值向量的检验
#### ---------------------------------------------

X1_n = X1
X2_n = X2.iloc[:6,]
X2_n.sum()
X2.sum()

Z = pd.DataFrame(X1_n.values - (n1/n2)**0.5 * X2_n.values,columns=['x1','x2','x3']) + (n1*n2)**(-0.5) * X2_n.sum() - (1/n2) * X2.sum()

Z_bar = Z.mean()
Sz = Z.cov()

T2 = 6 * Z_bar.T.dot(np.linalg.inv(Sz).dot(Z_bar)) ; T2

F_value = 3/15 * T2 ;F_value 

alpha = 0.05

rv_f = stats.f(3,3)
F_ctri = rv_f.ppf(1-alpha)

# F_value < F_ctri  落在接收域内

P_value = 1 - rv_f.cdf(F_value) ; P_value    

#### ---------------------------------------------
##      3.     哪一个显著性更好
#### ---------------------------------------------
# 1 首先二者的样本的协方差相差并不多
S1 = X1.cov()
S2 = X2.cov()

# 2 协方差相等_未知_均值向量的检验 的 P 值更大
# 协方差相等_未知_均值向量的检验  P 值 ：0.2692615756289385
# 协方差不相等_未知_均值向量的检验  P 值 ：0.24551618103606898

# 故选择 协方差相等_未知_均值向量的检验 可能更好


#### P80 4.2.2

# confidence region
def conf(n, x_bar, u, S):
    return n*(x_bar-u).T.dot(np.linalg.inv(S)).dot(x_bar-u)

def dat_conf(u):
    return conf(dat_n , dat_x_bar , u , dat_S)

dat=pd.read_excel('F:\\基础数学课\\应用多元统计分析\\examp4.2.2_homework.xlsx')

dat_x_bar = dat.mean()
dat_S = dat.cov()
dat_n = dat.shape[0]
dat_p = dat.shape[1]

#### --------------------------------------------- 
####   置信区域
#### --------------------------------------------- 

U1,U2 = np.mgrid[40:110:101j,50:110:101j]   

U = np.stack((np.ravel(U1),np.ravel(U2))).T    

Z = np.apply_along_axis(dat_conf,1,U).reshape(U1.shape)

fig=plt.figure()
ax=fig.gca(projection="3d")
ax.plot_surface(U1,U1,Z)
plt.show()

plt.contour(U1,U2,Z)

rv_f = stats.f(dat_p,dat_n-dat_p)
F_crit = rv_f.ppf(0.90)
T2_crit = F_crit * (dat_p)*(dat_n-1)/(dat_n-dat_p)

ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(50,100)
plt.xlim(50,100)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")


#### --------------------------------------------- 
#### 联合置信区间
#### --------------------------------------------- 

u1_min = dat_x_bar[0] - T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max = dat_x_bar[0] + T2_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5

u2_min = dat_x_bar[1] - T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max = dat_x_bar[1] + T2_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5

ct1 = plt.contour(U1,U2,Z,[T2_crit])
plt.ylim(66,90)
plt.xlim(60,85)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)

#### --------------------------------------------- 
####   Bonferroni 联合置信区间
#### --------------------------------------------- 
rv_t = stats.t(dat_n-1)
alpha = 0.1
k = 2
tk_crit = rv_t.ppf(1-alpha/(2*k))

u1_min_b = dat_x_bar[0] - tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5
u1_max_b = dat_x_bar[0] + tk_crit**0.5 * dat_S.iloc[0,0]**0.5/dat_n**0.5

u2_min_b = dat_x_bar[1] - tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5
u2_max_b = dat_x_bar[1] + tk_crit**0.5 * dat_S.iloc[1,1]**0.5/dat_n**0.5

#### --------------------------------------------- 
#### 联合置信区间 和 Bonferroni 联合置信区间
#### --------------------------------------------- 
ct1 = plt.contour(U1,U2,Z,[T2_crit]) 
plt.ylim(66,90)
plt.xlim(60,85)
plt.clabel(ct1)
plt. plot(dat_x_bar[0],dat_x_bar[1],"or")
plt.hlines((u2_min,u2_max),u1_min,u1_max)
plt.vlines((u1_min,u1_max),u2_min,u2_max)
plt.hlines((u2_min_b,u2_max_b),u1_min_b,u1_max_b,'red')
plt.vlines((u1_min_b,u1_max_b),u2_min_b,u2_max_b,'red')