注意!!!!!
第一行的路径用自己文件保存位置的路径
不知道的话,右键文件点开属性,复制在????处就行了。记得把"\"改成"/"
path_file<- "????????/data_for_assignment4.rds"
list_data<-readRDS(path_file)
amess <- list_data$q1_4[[1]]
bcStats <- list_data$q1_4[[2]]
trials <-list_data$q1_4[[3]]
#Q1
mod <- aov(folate~factor(treatmnt),data=amess)
mod
summary(aov(mod))
# 为什么更有效?
# 控制I型错误:当比较多个组时,单独的t检验会增加I型错误(错误地拒绝了正确的零假设)的风险。方差分析通过一次分析所有组来控制这种风险。
# 统计功效:方差分析通常比单独的t检验具有更高的统计功效,因为它使用所有可用数据来估计变异性。
# 多重比较问题:如果进行多个单独的t检验,需要进行多重比较校正。方差分析本身就包含了对多重比较的校正,因此不需要额外的校正步骤。
#Q2
pairwise.t.test(amess$folate, amess$treatmnt,p.adjust.method = "holm")
#乳腺癌发病率
pacman::p_load("rmarkdown")
paged_table(bcStats)
#Q3
pairwise.wilcox.test(bcStats$NewCasesOfBreastCancerIn2002,
bcStats$continent, p.adjust.method = "bonferroni",exact= FALSE)
#美洲和非洲,欧洲和非洲,欧洲和美洲,欧洲和亚洲有显著性差异
#Linear Regression
paged_table(trials)
x <- trials$Drug.concentration
y <- trials$Cell.Count
plot(x,y,cex=0.8)
text(x,y+0.2,labels=1:length(x))
model <- lm(y ~x)
par(mfrow=c(1,1))
plot(x,y,pch=16)
points(x,model$fitted.values,pch=16,col="grey")
arrows(x,model$fitted.values,x,y,code=0,col="grey")
abline(model,col="red",lty=2)
text(x,y+0.2,labels=1:length(x),cex=0.8,col="steelblue")
# Q4 summary(model)
par(mfrow=c(2,2))
plot(model)
#对数据进行分析后发现,其中10,15两个数据点误差较大。
#在Residuals vs Fitted中,3号、10号和15号残差均比较大,且10,15离拟合线较远
# 在Standardized residuals vs Observations中,看可以看出来,数据间相对不独立,有线性逻辑关系,但是10,15两个点离拟合出来的线较远
# Cook's distance vs Observations用于检测数据中的异常值,很清晰的我们可以发现,10,15两个点不分布在红线周围
# Residuals vs Leverage用于识别有高杠杆值的点,我们发现3点距离其他点远,杠杆值高,对数据影响大。
#Q5
pacman::p_load("ggpubr")
pacman::p_load("factoextra")
trials$Drug.concentration <- as.numeric(trials$Drug.concentration)
trials$Cell.Count <- as.numeric(trials$Cell.Count)
trials <- na.omit(trials) # 移除含有 NA 的行
set.seed(123)
res.km <- kmeans(scale(trials[,c(2,3)]), 3, nstart = 25)
fviz_cluster(res.km, data = trials[2-3])
#Q6
rowInd <- match(c("PILRA", "NPFFR2", "GPATCH3", "IFIT3", "KCNH1", "AUTS2", "ITGA9", "ARR3"), geneAnnotation$HUGO.gene.symbol)
geneAnnotation[rowInd,]
pacman::p_load('genefilter')
pacman::p_load('matrixStats')
# the rowSds function will calculate the standard deviation for each row in a numeric matrix
geneVar <- rowSds(as.matrix(normalizedValues))
library(RColorBrewer)
sampcol <- rep("blue", ncol(normalizedValues))
sampcol[patientMetadata$er == 1 ] <- "yellow"
rbPal <- brewer.pal(10, "RdBu")
genelist<-geneAnnotation$HUGO.gene.symbol
normalizedValues_matrix<-as.matrix(normalizedValues)
heatmap(normalizedValues_matrix,
ColSideColors = sampcol,
col=rbPal,
labRow = genelist,
labCol="")
genelist <- c("PILRA", "NPFFR2", "GPATCH3", "IFIT3", "KCNH1", "AUTS2", "ITGA9", "ARR3")
probes <- geneAnnotation$probe[match(genelist, geneAnnotation$HUGO.gene.symbol)]
exprows <- match(probes, rownames(normalizedValues))
heatmap(as.matrix(normalizedValues[exprows,]))
library(RColorBrewer)
sampcol <- rep("blue", ncol(normalizedValues))
sampcol[patientMetadata$er == 1 ] <- "yellow"
rbPal <- brewer.pal(10, "RdBu")
heatmap(as.matrix(normalizedValues[exprows,]),
ColSideColors = sampcol,
col=rbPal,
labRow = genelist,labCol="")
# hint:
chr8Genes <- geneAnnotation[geneAnnotation$Chromosome=="chr8",]
head(chr8Genes)
knitr::kable(chr8Genes)
chr8GenesOrd <- chr8Genes[order(chr8Genes$Start),]
head(chr8GenesOrd)
match(chr8GenesOrd$probe, rownames(normalizedValues))
chr8Expression <- normalizedValues[match(chr8GenesOrd$probe, rownames(normalizedValues)),]
dim(chr8Expression)
ngenes <- nrow(chr8Expression)
pdf("Chromosome8Genes.pdf")
pvals <- NULL
# hint:
for(i in 1:nrow(chr8Expression)) {
tmp <- t.test(as.numeric(chr8Expression[i,]) ~ factor(patientMetadata$er))
pvals[i] <- tmp$p.value
if(tmp$p.value < 0.05){
boxplot(as.numeric(chr8Expression[i,]) ~ factor(patientMetadata$er),
main=chr8Genes$HUGO.gene.symbol[i])
}
}
pvals
dev.off()
chr8Genes[pvals<0.05,]$HUGO.gene.symbol
pvals
sum(pvals < 0.05)
第六题我跑不出来,那个库下不下来,希望有大神指点