sklearn实战-乳腺癌细胞数据挖掘(博主亲自录视频)
医药统计项目联系QQ:231469242
Python 2.7
# -*- coding: utf-8 -*-
from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
MultiComparison)
# Import standard packages
import numpy as np
from scipy import stats
import pandas as pd
import variance_check
# Tukey HSD multiple-comparison script: loads the samples from an Excel
# workbook, checks that the groups have equal sizes, then runs and prints
# the Tukey HSD comparison of StressReduction across Treatment groups.
#
# All print calls use the single-argument parenthesised form so that the
# output is the same under Python 2.7 and Python 3 (a multi-argument
# print(...) prints a tuple under Python 2).

# Name of the Excel workbook that holds the data
excel = "sample.xlsx"
# Load the data
df = pd.read_excel(excel)
# Collect the StressReduction values of each treatment group as plain lists
group_mentaln = list(df.StressReduction[(df.Treatment == "mental")])
group_physical = list(df.StressReduction[(df.Treatment == "physical")])
group_medical = list(df.StressReduction[(df.Treatment == "medical")])
list_groups = [group_mentaln, group_physical, group_medical]
list_total = group_mentaln + group_physical + group_medical
print("equal test-----------------------------------------------------")
# Check whether the groups have the same number of samples; if they do not,
# methods such as Tukey are not appropriate.
equal_lenth = variance_check.Equal_lenth(list_groups)
# Explicit comparison kept on purpose: the return type of Equal_lenth is not
# visible from this script, so truthiness is not substituted for it.
if equal_lenth == False:
    print("the length of groups are not equal")
multiComp = MultiComparison(df['StressReduction'], df['Treatment'])
# Run the Tukey HSD test once and reuse the result for both the summary
# table and the critical q value.
tukey = multiComp.tukeyhsd()
summary = tukey.summary()
print(summary)
q = tukey.q_crit
print("q values: %s" % (q,))
# Sample q value:
#   Out[41]: 3.5057698487864877
#
# Sample summary output:
#   Multiple Comparison of Means - Tukey HSD,FWER=0.05
#   ===============================================
#   group1 group2 meandiff lower upper reject
#   -----------------------------------------------
#   medical mental 1.5 0.3217 2.6783 True
#   medical physical 1.0 -0.1783 2.1783 False
#   mental physical -0.5 -1.6783 0.6783 False
#   -----------------------------------------------
print("data details: %s" % (summary.data,))
# Sample summary.data output:
#   [['group1', 'group2', 'meandiff', 'lower', 'upper', 'reject'],
#   [u'medical', u'mental', 1.5, 0.32169999999999999, 2.6783000000000001, True],
#   [u'medical', u'physical', 1.0, -0.17829999999999999, 2.1783000000000001, False],
#   [u'mental', u'physical', -0.5, -1.6782999999999999, 0.67830000000000001, False]]
variance_check.py
# -*- coding: utf-8 -*-
'''
用于方差齐性检验
正态性检验
配对相等检验
'''
import scipy,math
from scipy.stats import f
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# additional packages
from statsmodels.stats.diagnostic import lillifors
#多重比较
from statsmodels.sandbox.stats.multicomp import multipletests
#用于排列组合
import itertools
'''
#测试数据
group1=[2,3,7,2,6]
group2=[10,8,7,5,10]
group3=[10,13,14,13,15]
list_groups=[group1,group2,group3]
list_total=group1+group2+group3
'''
# Presumably the significance level (alpha) used by the tests in this module.
# NOTE(review): no read of `a` is visible in this part of the file -- confirm
# where it is used.
a=0.05
def check_normality(testData, alpha=0.05):
    """Test whether a sample looks normally distributed.

    The test is selected from the sample size ``n = len(testData)``:

    * ``n <= 20``        -- Shapiro-Wilk (``scipy.stats.shapiro``)
    * ``20 < n < 50``    -- D'Agostino-Pearson (``scipy.stats.normaltest``)
    * ``50 <= n <= 300`` -- Lilliefors (``lillifors``)
    * ``n > 300``        -- Kolmogorov-Smirnov (``scipy.stats.kstest``)

    The name of the chosen test and the verdict are printed.

    Args:
        testData: Sized sequence of numeric observations.
        alpha: Significance level; a p-value below it rejects normality.
            Defaults to 0.05, the threshold that used to be hard-coded.

    Returns:
        True when normality is not rejected, False when it is.
    """
    n = len(testData)
    if n <= 20:
        label = "use shapiro:"
        p_value = stats.shapiro(testData)[1]
    elif n < 50:
        label = "use normaltest"
        p_value = stats.normaltest(testData)[1]
    elif n <= 300:
        label = "use lillifors:"
        p_value = lillifors(testData)[1]
    else:
        label = "use kstest:"
        # kstest(..., 'norm') without args compares against the *standard*
        # normal N(0, 1), which rejects any sample that is not already
        # standardised. Compare against a normal with the sample's own
        # location and scale instead.
        # NOTE(review): with parameters estimated from the sample the KS
        # p-value is conservative; the Lilliefors test corrects for that.
        mu = np.mean(testData)
        sigma = np.std(testData, ddof=1)
        if sigma > 0:
            p_value = stats.kstest(testData, 'norm', args=(mu, sigma))[1]
        else:
            # A sample with no spread cannot be fitted by a normal
            # distribution; report it as not normal.
            p_value = 0.0
    print(label)
    if p_value < alpha:
        print("data are not normal distributed")
        return False
    print("data are normal distributed")
    return True
#对所有样本组进行正态性检验
def NormalTest(list_groups):
for group in list_groups:
#正态性检验
status&