sklearn实战-乳腺癌细胞数据挖掘(博客主亲自录制视频教程)
https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share
医药统计项目联系:QQ231469242
#mental
group1=[2,2,3,4,4,5,3,4,4,4]
#physical
group2=[4,4,3,5,4,1,1,2,3,3]
#medical
group3=[1,2,2,2,3,2,3,1,3,1]
多重检验结果和贾俊平的LSD结果不一样,经过T配对试验,多重检验和T配对试验一致,LSD对小样本可能不准确
# -*- coding: utf-8 -*-
# Import standard packages
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
import os
# additional packages
import sys
sys.path.append(os.path.join('..', '..', 'Utilities'))
try:
# Import formatting commands if directory "Utilities" is available
from ISP_mystyle import showData
except ImportError:
# Ensure correct performance otherwise
def showData(*options):
plt.show()
return
# Other required packages
from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
MultiComparison)
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.libqsturng import psturng
def setData():
''' Set up the data, as a structured array. '''
# The first and last field are 32-bit intergers; the second field is an
# 8-byte string. Note that here we can also give names to the individual
# fields!
data = np.rec.array([
( 1, 'mental', 2 ),
( 2, 'mental', 2 ),
( 3, 'mental', 3 ),
( 4, 'mental', 4 ),
( 5, 'mental', 4 ),
( 6, 'mental', 5 ),
( 7, 'mental', 3 ),
( 8, 'mental', 4 ),
( 9, 'mental', 4 ),
( 10, 'mental', 4 ),
( 11, 'physical', 4 ),
( 12, 'physical', 4 ),
( 13, 'physical', 3 ),
( 14, 'physical', 5 ),
( 15, 'physical', 4 ),
( 16, 'physical', 1 ),
( 17, 'physical', 1 ),
( 18, 'physical', 2 ),
( 19, 'physical', 3 ),
( 20, 'physical', 3 ),
( 21, 'medical', 1 ),
( 22, 'medical', 2 ),
( 23, 'medical', 2 ),
( 24, 'medical', 2 ),
( 25, 'medical', 3 ),
( 26, 'medical', 2 ),
( 27, 'medical', 3 ),
( 28, 'medical', 1 ),
( 29, 'medical', 3 ),
( 30, 'medical', 1 )], dtype=[('idx', '<i4'),
('Treatment', '|S8'),
('StressReduction', '<i4')])
return data
def doAnova(data):
'''one-way ANOVA'''
df = pd.DataFrame(data)
model = ols('StressReduction ~ C(Treatment)',df).fit()
anovaResults = anova_lm(model)
print(anovaResults)
if anovaResults['PR(>F)'][0] < 0.05:
print('One of the groups is different.')
def doTukey(data, multiComp):
'''Do a pairwise comparison, and show the confidence intervals'''
print((multiComp.tukeyhsd().summary()))
# Calculate the p-values:
res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
df = pd.DataFrame(data)
numData = len(df)
numTreatments = len(df.Treatment.unique())
dof = numData - numTreatments
# Show the group names
print((multiComp.groupsunique))
# Generate a print -------------------
# Get the data
xvals = np.arange(3)
res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
errors = np.ravel(np.diff(res2.confint)/2)
# Plot them
plt.plot(xvals, res2.meandiffs, 'o')
plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
# Put on labels
pair_labels = multiComp.groupsunique[np.column_stack(res2._multicomp.pairindices)]
plt.xticks(xvals, pair_labels)
# Format the plot
xlim = -0.5, 2.5
plt.hlines(0, *xlim)
plt.xlim(*xlim)
plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
'\n Pairwise Mean Differences')
# Save to outfile, and show the data
outFile = 'multComp.png'
showData(outFile)
def Holm_Bonferroni(multiComp):
''' Instead of the Tukey's test, we can do pairwise t-test '''
# First, with the "Holm" correction
rtp = multiComp.allpairtest(stats.ttest_rel, method='Holm')
print((rtp[0]))
# and then with the Bonferroni correction
print((multiComp.allpairtest(stats.ttest_rel, method='b')[0]))
# Any value, for testing the program for correct execution
checkVal = rtp[1][0][0,0]
return checkVal
def main():
# Get the data
data = setData()
# Do a one-way ANOVA
doAnova(data)
#Then, do the multiple testing
multiComp = MultiComparison(data['StressReduction'], data['Treatment'])
doTukey(data, multiComp) # Tukey's HSD test
checkVal = Holm_Bonferroni(multiComp) # Alternatives to Tukey's HSD test
return checkVal # this is only for regression testing of the program
if __name__ == '__main__':
main()
多重检验结果和贾俊平的LSD结果不一样,经过T配对试验,多重检验和T配对试验一致,LSD对小样本可能不准确
# -*- coding: utf-8 -*-
import scipy,math
from scipy.stats import f
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# additional packages
from statsmodels.stats.diagnostic import lillifors
#多重比较
from statsmodels.sandbox.stats.multicomp import multipletests
#用于排列组合
import itertools
#独立T检验
from scipy.stats import ttest_ind
#配对T检验
from scipy.stats import ttest_rel
#mental
group1=[2,2,3,4,4,5,3,4,4,4]
#physical
group2=[4,4,3,5,4,1,1,2,3,3]
#medical
group3=[1,2,2,2,3,2,3,1,3,1]
list_groups=[group1,group2,group3]
list_total=group1+group2+group3
a=0.05
#one within group error,also know as random error
def SE(group):
se=0
mean1=np.mean(group)
for i in group:
error=i-mean1
se+=error**2
return se
'''
>>> SE(group1)
22.0
>>> SE(group2)
18.0
>>> SE(group3)
14.0
'''
#sum of squares within group error,also know as random error
def SSE(list_groups):
sse=0
for group in list_groups:
se=SE(group)
sse+=se
return sse
#误差总平方和
def SST(list_total):
sst=0
mean1=np.mean(list_total)
for i in list_total:
error=i-mean1
sst+=error**2
return sst
#SSA,between-group sum of squares 组间平方和,公式1:ssa=sst-sse
def SSA(list_groups,list_total):
sse=SSE(list_groups)
sst=SST(list_total)
ssa=sst-sse
return ssa
#SSA,between-group sum of squares 组间平方和
def SSA1(list_groups,list_total):
mean_total=np.mean(list_total)
ssa=0
for group in list_groups:
group_mean=np.mean(group)
distance=(mean_total-group_mean)**2
ssa+=distance
ssa=ssa*5
return ssa
#处理效应均方
def MSA(list_groups,list_total):
ssa=SSA(list_groups,list_total)
msa=ssa/(len(list_groups)-1)*1.0
return msa
# 误差均方
def MSE(list_groups,list_total):
sse=SSE(list_groups)
mse=sse/(len(list_total)-1*len(list_groups))*1.0
return mse
#F score
def F(list_groups,list_total):
msa=MSA(list_groups,list_total)
mse=MSE(list_groups,list_total)
ratio=msa/mse*1.0
return ratio
'''
>>> F(list_groups,list_total)
22.592592592592595
'''
#LSD检验有问题,需要核对,不如配对T检验准确
#LSD(least significant difference)最小显著差异
def LSD(list_groups,list_total,sample1,sample2):
mean1=np.mean(sample1)
mean2=np.mean(sample2)
distance=abs(mean1-mean2)
print"distance:",distance
#t检验的自由度
df=len(list_total)-1*len(list_groups)
mse=MSE(list_groups,list_total)
print"MSE:",mse
t_value=stats.t(df).isf(a/2)
print"t value:",t_value
lsd=t_value*math.sqrt(mse*(1.0/len(sample1)+1.0/len(sample2)))
print "LSD:",lsd
if distance<lsd:
print"no significant difference between:",sample1,sample2
else:
print"there is significant difference between:",sample1,sample2
#正态分布测试
def check_normality(testData):
#20<样本数<50用normal test算法检验正态分布性
if 20<len(testData) <50:
p_value= stats.normaltest(testData)[1]
if p_value<0.05:
print"use normaltest"
print "data are not normal distributed"
return False
else:
print"use normaltest"
print "data are normal distributed"
return True
#样本数小于50用Shapiro-Wilk算法检验正态分布性
if len(testData) <50:
p_value= stats.shapiro(testData)[1]
if p_value<0.05:
print "use shapiro:"
print "data are not normal distributed"
return False
else:
print "use shapiro:"
print "data are normal distributed"
return True
if 300>=len(testData) >=50:
p_value= lillifors(testData)[1]
if p_value<0.05:
print "use lillifors:"
print "data are not normal distributed"
return False
else:
print "use lillifors:"
print "data are normal distributed"
return True
if len(testData) >300:
p_value= stats.kstest(testData,'norm')[1]
if p_value<0.05:
print "use kstest:"
print "data are not normal distributed"
return False
else:
print "use kstest:"
print "data are normal distributed"
return True
#对所有样本组进行正态性检验
def NormalTest(list_groups):
for group in list_groups:
#正态性检验
status=check_normality(group1)
if status==False :
return False
#排列组合函数
def Combination(list_groups):
combination= []
for i in range(1,len(list_groups)+1):
iter = itertools.combinations(list_groups,i)
combination.append(list(iter))
#需要排除第一个和最后一个
return combination[1:-1][0]
'''
Out[57]:
[[([2, 3, 7, 2, 6], [10, 8, 7, 5, 10]),
([2, 3, 7, 2, 6], [10, 13, 14, 13, 15]),
([10, 8, 7, 5, 10], [10, 13, 14, 13, 15])]]
'''
#多重比较
def Multiple_test(list_groups):
combination=Combination(list_groups)
for pair in combination:
LSD(list_groups,list_total,pair[0],pair[1])
#discriptive statistcs
print "discriptive statistcs----------------------------------------------"
print "group1 mean",np.mean(group1)
print "group2 mean",np.mean(group2)
print "group3 mean",np.mean(group3)
#对所有样本组进行正态性检验
print"M=Normality test:-----------------------------------"
NormalTest(list_groups)
#方差齐性检测
print"levene test:-----------------------------------"
leveneResult=scipy.stats.levene(group1,group2,group3)
leveneScore=leveneResult[0]
p_levene=leveneResult[1]
if p_levene<0.05:
print"levene test is not fit,be attention!"
else:
print"levene test is ok"
'''
H0成立,三组数据方差无显著差异
Out[9]: LeveneResult(statistic=0.24561403508771934, pvalue=0.7860617221429711)
'''
print "result--------------------------------------------------"
f_score=F(list_groups,list_total)
print"F score:",f_score
#sf 为生存函数survival function
#组数自由度
df1=len(list_groups)-1
#所有样本的自由度
df2=len(list_total)-1*len(list_groups)
probability=f.sf(f_score,df1,df2)
print"p value:",probability
'''
Out[28]: 8.5385924542746692e-05
'''
if probability<0.05:
print"there is significance,H1 win"
else:
print"there is no significance,H0 win"
#多重比较
print"multiple test----------------------------------------------"
print"Multiple test",Multiple_test
Multiple_test(list_groups)
贾俊平书里的例子,综合多重检验和LSD方法结果一致
#居民区
group1=[265,310,220,290,350,300,445,480,500,430,428,530]
#商业区
group2=[410,305,450,380,310,390,590,480,510,470,415,390]
#写字楼
group3=[180,290,330,220,170,256,290,283,260,246,275,320]
# -*- coding: utf-8 -*-
# Import standard packages
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
import os
# additional packages
import sys
sys.path.append(os.path.join('..', '..', 'Utilities'))
try:
# Import formatting commands if directory "Utilities" is available
from ISP_mystyle import showData
except ImportError:
# Ensure correct performance otherwise
def showData(*options):
plt.show()
return
# Other required packages
from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
MultiComparison)
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.libqsturng import psturng
def setData():
''' Set up the data, as a structured array. '''
# The first and last field are 32-bit intergers; the second field is an
# 8-byte string. Note that here we can also give names to the individual
# fields!
data = np.rec.array([
( 1, 'resident', 265 ),
( 2, 'resident', 310 ),
( 3, 'resident', 220 ),
( 4, 'resident', 290 ),
( 5, 'resident', 350 ),
( 6, 'resident', 300 ),
( 7, 'resident', 445 ),
( 8, 'resident', 480 ),
( 9, 'resident', 500 ),
( 10, 'resident', 430),
( 11, 'resident', 428),
( 12, 'resident', 530 ),
( 13, 'commercial', 410 ),
( 14, 'commercial', 305 ),
( 15, 'commercial', 450 ),
( 16, 'commercial', 380 ),
( 17, 'commercial', 310 ),
( 18, 'commercial', 390 ),
( 19, 'commercial', 590 ),
( 20, 'commercial', 480 ),
( 21, 'commercial', 510 ),
( 22, 'commercial', 470 ),
( 23, 'commercial', 415 ),
( 24, 'commercial', 390 ),
( 25, 'officeBuilding', 180 ),
( 26, 'officeBuilding', 290 ),
( 27, 'officeBuilding', 330 ),
( 28, 'officeBuilding', 220 ),
( 29, 'officeBuilding', 170),
( 30, 'officeBuilding', 256),
( 31, 'officeBuilding', 290 ),
( 32, 'officeBuilding', 283 ),
( 33, 'officeBuilding', 260),
( 34, 'officeBuilding', 246),
( 35, 'officeBuilding', 275),
( 36, 'officeBuilding', 320),
], dtype=[('idx', '<i4'),
('Treatment', '|S8'),
('StressReduction', '<i4')])
return data
def doAnova(data):
'''one-way ANOVA'''
df = pd.DataFrame(data)
model = ols('StressReduction ~ C(Treatment)',df).fit()
anovaResults = anova_lm(model)
print(anovaResults)
if anovaResults['PR(>F)'][0] < 0.05:
print('One of the groups is different.')
def doTukey(data, multiComp):
'''Do a pairwise comparison, and show the confidence intervals'''
print((multiComp.tukeyhsd().summary()))
# Calculate the p-values:
res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
df = pd.DataFrame(data)
numData = len(df)
numTreatments = len(df.Treatment.unique())
dof = numData - numTreatments
# Show the group names
print((multiComp.groupsunique))
# Generate a print -------------------
# Get the data
xvals = np.arange(3)
res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
errors = np.ravel(np.diff(res2.confint)/2)
# Plot them
plt.plot(xvals, res2.meandiffs, 'o')
plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
# Put on labels
pair_labels = multiComp.groupsunique[np.column_stack(res2._multicomp.pairindices)]
plt.xticks(xvals, pair_labels)
# Format the plot
xlim = -0.5, 2.5
plt.hlines(0, *xlim)
plt.xlim(*xlim)
plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
'\n Pairwise Mean Differences')
# Save to outfile, and show the data
outFile = 'multComp.png'
showData(outFile)
def Holm_Bonferroni(multiComp):
''' Instead of the Tukey's test, we can do pairwise t-test '''
# First, with the "Holm" correction
rtp = multiComp.allpairtest(stats.ttest_rel, method='Holm')
print((rtp[0]))
# and then with the Bonferroni correction
print((multiComp.allpairtest(stats.ttest_rel, method='b')[0]))
# Any value, for testing the program for correct execution
checkVal = rtp[1][0][0,0]
return checkVal
def main():
# Get the data
data = setData()
# Do a one-way ANOVA
doAnova(data)
#Then, do the multiple testing
multiComp = MultiComparison(data['StressReduction'], data['Treatment'])
doTukey(data, multiComp) # Tukey's HSD test
checkVal = Holm_Bonferroni(multiComp) # Alternatives to Tukey's HSD test
return checkVal # this is only for regression testing of the program
if __name__ == '__main__':
main()
贾俊平书里的例子,综合多重检验和LSD方法结果一致
#居民区
group1=[265,310,220,290,350,300,445,480,500,430,428,530]
#商业区
group2=[410,305,450,380,310,390,590,480,510,470,415,390]
#写字楼
group3=[180,290,330,220,170,256,290,283,260,246,275,320]
# -*- coding: utf-8 -*-
import scipy,math
from scipy.stats import f
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# additional packages
from statsmodels.stats.diagnostic import lillifors
#多重比较
from statsmodels.sandbox.stats.multicomp import multipletests
#用于排列组合
import itertools
#独立T检验
from scipy.stats import ttest_ind
#配对T检验
from scipy.stats import ttest_rel
#居民区
group1=[265,310,220,290,350,300,445,480,500,430,428,530]
#商业区
group2=[410,305,450,380,310,390,590,480,510,470,415,390]
#写字楼
group3=[180,290,330,220,170,256,290,283,260,246,275,320]
list_groups=[group1,group2,group3]
list_total=group1+group2+group3
a=0.05
#one within group error,also know as random error
def SE(group):
se=0
mean1=np.mean(group)
for i in group:
error=i-mean1
se+=error**2
return se
'''
>>> SE(group1)
22.0
>>> SE(group2)
18.0
>>> SE(group3)
14.0
'''
#sum of squares within group error,also know as random error
def SSE(list_groups):
sse=0
for group in list_groups:
se=SE(group)
sse+=se
return sse
#误差总平方和
def SST(list_total):
sst=0
mean1=np.mean(list_total)
for i in list_total:
error=i-mean1
sst+=error**2
return sst
#SSA,between-group sum of squares 组间平方和,公式1:ssa=sst-sse
def SSA(list_groups,list_total):
sse=SSE(list_groups)
sst=SST(list_total)
ssa=sst-sse
return ssa
#SSA,between-group sum of squares 组间平方和
def SSA1(list_groups,list_total):
mean_total=np.mean(list_total)
ssa=0
for group in list_groups:
group_mean=np.mean(group)
distance=(mean_total-group_mean)**2
ssa+=distance
ssa=ssa*5
return ssa
#处理效应均方
def MSA(list_groups,list_total):
ssa=SSA(list_groups,list_total)
msa=ssa/(len(list_groups)-1)*1.0
return msa
# 误差均方
def MSE(list_groups,list_total):
sse=SSE(list_groups)
mse=sse/(len(list_total)-1*len(list_groups))*1.0
return mse
#F score
def F(list_groups,list_total):
msa=MSA(list_groups,list_total)
mse=MSE(list_groups,list_total)
ratio=msa/mse*1.0
return ratio
'''
>>> F(list_groups,list_total)
22.592592592592595
'''
#LSD检验有问题,需要核对,不如配对T检验准确
#LSD(least significant difference)最小显著差异
def LSD(list_groups,list_total,sample1,sample2):
mean1=np.mean(sample1)
mean2=np.mean(sample2)
distance=abs(mean1-mean2)
print"distance:",distance
#t检验的自由度
df=len(list_total)-1*len(list_groups)
mse=MSE(list_groups,list_total)
print"MSE:",mse
t_value=stats.t(df).isf(a/2)
print"t value:",t_value
lsd=t_value*math.sqrt(mse*(1.0/len(sample1)+1.0/len(sample2)))
print "LSD:",lsd
if distance<lsd:
print"no significant difference between:",sample1,sample2
else:
print"there is significant difference between:",sample1,sample2
#正态分布测试
def check_normality(testData):
#20<样本数<50用normal test算法检验正态分布性
if 20<len(testData) <50:
p_value= stats.normaltest(testData)[1]
if p_value<0.05:
print"use normaltest"
print "data are not normal distributed"
return False
else:
print"use normaltest"
print "data are normal distributed"
return True
#样本数小于50用Shapiro-Wilk算法检验正态分布性
if len(testData) <50:
p_value= stats.shapiro(testData)[1]
if p_value<0.05:
print "use shapiro:"
print "data are not normal distributed"
return False
else:
print "use shapiro:"
print "data are normal distributed"
return True
if 300>=len(testData) >=50:
p_value= lillifors(testData)[1]
if p_value<0.05:
print "use lillifors:"
print "data are not normal distributed"
return False
else:
print "use lillifors:"
print "data are normal distributed"
return True
if len(testData) >300:
p_value= stats.kstest(testData,'norm')[1]
if p_value<0.05:
print "use kstest:"
print "data are not normal distributed"
return False
else:
print "use kstest:"
print "data are normal distributed"
return True
#对所有样本组进行正态性检验
def NormalTest(list_groups):
for group in list_groups:
#正态性检验
status=check_normality(group1)
if status==False :
return False
#排列组合函数
def Combination(list_groups):
combination= []
for i in range(1,len(list_groups)+1):
iter = itertools.combinations(list_groups,i)
combination.append(list(iter))
#需要排除第一个和最后一个
return combination[1:-1][0]
'''
Out[57]:
[[([2, 3, 7, 2, 6], [10, 8, 7, 5, 10]),
([2, 3, 7, 2, 6], [10, 13, 14, 13, 15]),
([10, 8, 7, 5, 10], [10, 13, 14, 13, 15])]]
'''
#多重比较
def Multiple_test(list_groups):
combination=Combination(list_groups)
for pair in combination:
LSD(list_groups,list_total,pair[0],pair[1])
#discriptive statistcs
print "discriptive statistcs----------------------------------------------"
print "group1 mean",np.mean(group1)
print "group2 mean",np.mean(group2)
print "group3 mean",np.mean(group3)
#对所有样本组进行正态性检验
print"M=Normality test:-----------------------------------"
NormalTest(list_groups)
#方差齐性检测
print"levene test:-----------------------------------"
leveneResult=scipy.stats.levene(group1,group2,group3)
leveneScore=leveneResult[0]
p_levene=leveneResult[1]
if p_levene<0.05:
print"levene test is not fit,be attention!"
else:
print"levene test is ok"
'''
H0成立,三组数据方差无显著差异
Out[9]: LeveneResult(statistic=0.24561403508771934, pvalue=0.7860617221429711)
'''
print "result--------------------------------------------------"
f_score=F(list_groups,list_total)
print"F score:",f_score
#sf 为生存函数survival function
#组数自由度
df1=len(list_groups)-1
#所有样本的自由度
df2=len(list_total)-1*len(list_groups)
probability=f.sf(f_score,df1,df2)
print"p value:",probability
'''
Out[28]: 8.5385924542746692e-05
'''
if probability<0.05:
print"there is significance,H1 win"
else:
print"there is no significance,H0 win"
#多重比较
print"multiple test----------------------------------------------"
print"Multiple test",Multiple_test
Multiple_test(list_groups)