T检验python实现

最新推荐文章于 2024-04-21 23:02:15 发布

铁松溜达py

最新推荐文章于 2024-04-21 23:02:15 发布

阅读量456

点赞数

文章标签： python 机器学习算法

本文链接：https://blog.csdn.net/book_dw5189/article/details/130799242

版权

from scipy import stats
import numpy as np
# Homogeneity-of-variance check (Levene's test).
# Variance describes how far a set of values spreads around its mean.
# Levene's test asks whether two or more groups differ in that spread;
# equal variances are a precondition for many other tests and algorithms.
np.random.seed(12345678)
first_population = stats.norm(loc=5, scale=10)
second_population = stats.norm(loc=25, scale=9)
rvs1 = first_population.rvs(size=500)
rvs2 = second_population.rvs(size=500)
levene_result = stats.levene(rvs1, rvs2)
print(levene_result)

from scipy import stats
import numpy as np
# One-sample t-test: checks whether the data are consistent with a population
# that has a given mean. The t-test family is built around comparing means.
# Note: every t-test shown in these examples is two-sided.
np.random.seed(12345678)
sample_shape = (100, 2)
rvs = stats.norm.rvs(loc=5, scale=10, size=sample_shape)
# One hypothesised mean per column of the sample.
hypothesised_means = [1, 5]
one_sample_result = stats.ttest_1samp(rvs, hypothesised_means)
print(one_sample_result)

from scipy import stats
import numpy as np
# Independent two-sample t-test: used to compare whether two groups of data
# come from the same normally distributed population.
# Note: if the two groups do not have equal variances, pass equal_var=False
# to ttest_ind().
np.random.seed(12345678)
shared_draw = {"scale": 10, "size": 500}
rvs1 = stats.norm.rvs(loc=5, **shared_draw)
rvs2 = stats.norm.rvs(loc=6, **shared_draw)
independent_result = stats.ttest_ind(rvs1, rvs2)
print(independent_result)

from scipy import stats
import numpy as np
# Paired-sample t-test.
# It can be seen as an extension of the one-sample t-test: instead of one
# group of independent, normally distributed observations, the quantity under
# test is the difference between two groups of paired observations.
# Typical uses: before/after comparisons on the same subjects, or subjects
# matched pairwise on some condition and then given different treatments.
np.random.seed(12345678)
rvs1 = stats.norm.rvs(loc=5, scale=10, size=500)
second_draw = stats.norm.rvs(loc=5, scale=10, size=500)
small_noise = stats.norm.rvs(scale=0.2, size=500)
rvs2 = second_draw + small_noise
paired_result = stats.ttest_rel(rvs1, rvs2)
print(paired_result)

## Import the packages
import numpy as np
from scipy import stats

## Define 2 random samples
# Sample size (the same for both groups)
N = 10
# Gaussian distributed data with mean = 2 and var = 1
a = np.random.randn(N) + 2
# Gaussian distributed data with mean = 0 and var = 1
b = np.random.randn(N)

## Calculate the pooled standard deviation
# ddof=1 divides by N - 1, which gives the unbiased sample variance.
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)

# Pooled standard deviation. A plain average of the two variances is enough
# here because both groups have the same size N.
s = np.sqrt((var_a + var_b) / 2)

## Calculate the t-statistic
t = (a.mean() - b.mean()) / (s * np.sqrt(2 / N))

## Compare with the t distribution
# Degrees of freedom
df = 2 * N - 2

# One-tail probability beyond |t|. Using sf(|t|) instead of 1 - cdf(t) keeps
# the value in [0, 0.5] when t is negative (so the doubled value below can
# never exceed 1) and avoids the precision loss of subtracting from 1.
p = stats.t.sf(np.abs(t), df=df)

print("t = " + str(t))
# The one-tail probability is doubled because this is a two-tailed t-test.
print("p = " + str(2 * p))
# No seed is set in this section, so the exact t and p depend on the state of
# the global NumPy random generator. With true means 2 and 0, the p-value is
# usually small, in which case the null hypothesis of equal means is rejected
# and the difference between the two means is statistically significant.

## Cross-check with the built-in scipy function
t2, p2 = stats.ttest_ind(a, b)
print("t = " + str(t2))
# ttest_ind already returns the two-sided p-value, so it is printed as-is;
# doubling it again would report twice the real value.
print("p = " + str(p2))
# The t-score is the ratio of the difference between the two groups to the
# variation within the groups.
# The larger the t-score, the bigger the difference between the groups.
# The smaller the t-score, the more similar the groups are. A t-score of 3
# means the between-group difference is three times the within-group variation.
# A t-test compares two means and tells you whether they differ. It also tells
# you whether that difference is meaningful, i.e. whether it could plausibly
# have arisen by chance.

import numpy as np
from scipy import stats

# Input data
n = 50  # sample size
mean = 8000  # sample mean
std = 1500  # sample standard deviation
pop_mean = 7500  # population mean under the null hypothesis
alpha = 0.05  # significance level

# Compute the t statistic and the p-value from the summary statistics.
t_value = (mean - pop_mean) / (std / np.sqrt(n))
# Upper-tail p-value with n - 1 degrees of freedom. The signed statistic is
# used on purpose: the conclusion printed below is directional ("higher
# than"), and sf(|t|) would also reject when the sample mean is far *below*
# the population mean, which would make that conclusion wrong.
p_value = stats.t.sf(t_value, n - 1)
# Law of large numbers: as the sample size grows, the sample mean approaches
# the population mean.
# Central limit theorem: the sample itself may not be normally distributed,
# but sample means are approximately normally distributed.
# scipy.stats distribution methods:
#   rvs - draw random variates from the given distribution
#   pdf - probability density function
#   cdf - cumulative distribution function
#   sf  - survival function (1 - cdf)
#   ppf - percent point function (inverse of the cdf)
#   isf - inverse survival function
#   fit - fit the distribution to a random sample (maximum-likelihood
#         estimate of the distribution parameters)
# Decide whether to reject the null hypothesis.
if p_value < alpha:
    print("拒绝原假设，说明这50名员工的平均工资高于全国平均水平")
else:
    print("接受原假设，说明这50名员工的平均工资不高于全国平均水平")

import pandas as pd
import numpy as np
from scipy import stats
def confidence_interval_u(data, sigma=-1, alpha=0.05, side_both=True):
    """Return a confidence interval for the mean of ``data``.

    When ``sigma`` is positive it is used as the known population standard
    deviation and the pivot follows the standard normal distribution.
    Otherwise the sample standard deviation (``ddof=1``) is used and the
    pivot follows a t distribution with ``len(data) - 1`` degrees of freedom.

    Args:
        data: Sequence of observations.
        sigma: Known population standard deviation; any value <= 0 means
            "unknown".
        alpha: Significance level; the confidence level is ``1 - alpha``.
        side_both: True for a two-sided interval, False for one-sided limits.

    Returns:
        A ``(lower, upper)`` tuple when ``side_both`` is true; otherwise a
        dict with keys ``'bottom_limit'`` and ``'top_limit'`` holding the
        one-sided lower and upper confidence limits.
    """
    center = np.mean(data)
    sample_std = np.std(data, ddof=1)
    count = len(data)

    # Choose the pivot distribution and the matching spread estimate.
    if sigma > 0:
        pivot, spread = stats.norm(loc=0, scale=1.), sigma
    else:
        pivot, spread = stats.t(df=count - 1), sample_std

    if side_both:
        margin = spread / np.sqrt(count) * pivot.ppf(1 - alpha / 2)
        return (center - margin, center + margin)

    margin = spread / np.sqrt(count) * pivot.ppf(1 - alpha)
    return {'bottom_limit': center - margin, 'top_limit': center + margin}
# Ten sample values used to demonstrate the interval estimate.
data = np.array([101.3, 96.6, 100.4, 98.8, 94.6, 103.1, 102.3, 97.5, 105.4, 100.2])
# sigma=3 is passed as the known standard deviation; alpha and side_both keep
# their defaults (0.05, True), so this is a two-sided interval.
# The return value is neither stored nor printed here.
confidence_interval_u(data,3)

import pandas as pd
from sklearn.datasets import load_iris
import numpy as np
from scipy import stats

# Load the iris measurements into a DataFrame with readable column names.
iris = load_iris()
feature_names = ['sepal_length','sepal_width','petal_length','petal_width']
data = pd.DataFrame(iris.data, columns=feature_names)

# Work on the petal-length column only.
petal_length = data['petal_length']
sample_size = len(petal_length)

# Sample mean and sample standard deviation.
mean = petal_length.mean()
std = petal_length.std()
print('样本均值：',mean)
print('样本标准差：',std)

# t statistic against a hypothesised mean of 3.5.
standard_error = std/np.sqrt(sample_size)
t = (mean-3.5)/standard_error
print('t统计量：', t)

# Two-sided p-value. df is the degrees of freedom, i.e. the number of values
# that are free to vary (sample size minus one).
p = 2*stats.t.sf(abs(t), df=sample_size-1)
print('P-Value值：', p)

import pandas as pd
import numpy as np
from scipy import stats
def confidence_interval_u(data, sigma=-1, alpha=0.05, side_both=True):
    """Estimate a confidence interval for the mean of ``data``.

    A positive ``sigma`` is treated as the known population standard
    deviation (standard normal pivot). With ``sigma <= 0`` the sample standard
    deviation with ``ddof=1`` is used instead, together with a t distribution
    that has ``len(data) - 1`` degrees of freedom.

    Args:
        data: Sequence of observations.
        sigma: Known population standard deviation, or a non-positive value
            when it is unknown.
        alpha: Significance level (confidence level is ``1 - alpha``).
        side_both: Two-sided interval when true, one-sided limits otherwise.

    Returns:
        ``(lower, upper)`` for the two-sided case; for the one-sided case a
        dict ``{'bottom_limit': ..., 'top_limit': ...}`` with the one-sided
        lower and upper limits.
    """
    xb = np.mean(data)
    s = np.std(data, ddof=1)
    root_n = np.sqrt(len(data))
    # Two-sided intervals split alpha across both tails.
    upper_prob = 1 - alpha / 2 if side_both else 1 - alpha

    if sigma > 0:
        # Known sigma: standard normal pivot.
        half_width = sigma / root_n * stats.norm(loc=0, scale=1.).ppf(upper_prob)
    else:
        # Unknown sigma: t pivot with n - 1 degrees of freedom.
        half_width = s / root_n * stats.t(df=len(data) - 1).ppf(upper_prob)

    lower, upper = xb - half_width, xb + half_width
    if not side_both:
        return {'bottom_limit': lower, 'top_limit': upper}
    return (lower, upper)
# Forty sample values.
# NOTE(review): 406.2 is far outside the 100.9-113.6 range of every other
# value; possibly a typo for 106.2. Confirm against the data source.
data=[107.5,110.1,105.7,106.1,104.7,105.6,107.0,406.2,107.3,106.1,
106.7,109.4,111.5,102.4,111.0,112.8,100.9,107.8,104.4,113.5,
105.7,105.9,108.5,106.3,109.4,107.7,104.7,113.6,100.9,109.3,
106.5,102.5,101.1,106.3,102.6,107.2,105.8,107.8,104.7,106.0]
u = np.mean(data)  # sample mean; not used by the call below
# Two-sided interval at alpha = 0.05 with sigma = 5 passed as the known
# standard deviation. The return value is neither stored nor printed here.
confidence_interval_u(data,5,alpha=0.05, side_both=True)