R 多核并行计算包哪家强?
##### 生存分析
library(RTCGA)
#了解数据
infoTCGA <- infoTCGA() #这个命令会返回一个数据框,可以知道有哪些数据可被下载
#获得临床数据:
# Create the clinical data
# BiBiocManager::install('RTCGA.clinical')
library(RTCGA.clinical)
clin <- survivalTCGA(BRCA.clinical) #到这里临床部分的信息已经获得啦
head(clin)
# 下载基因表达信息
library(RTCGA.mRNA) #加载数据包
class(BRCA.mRNA) #查看数据类型发现是个数据框
dim(BRCA.mRNA) #看一下数据维度发现有590个样本,17815个基因
BRCA.mRNA[1:5, 1:5] #看一下部分数据样子
library(dplyr)
exprSet <- BRCA.mRNA %>%
# then make it a tibble (nice printing while debugging)
# as_tibble() %>%
# then get just a few genes,
# select(bcr_patient_barcode, PAX8, GATA3, ESR1) %>%
# then trim the barcode (see head(clin), and substr)
mutate(bcr_patient_barcode = substr(bcr_patient_barcode, 1, 12)) %>%
# then join back to clinical data
inner_join(clin, by="bcr_patient_barcode")
# 缺失值处理
# is.na(exprSet)[1:5, 1:5]
exprSet <- na.omit(exprSet)
# 查看数据
exprSet[1:4,c(1:3,(ncol(exprSet)-3):ncol(exprSet))]
exprSet[1:4,(ncol(exprSet)-2):ncol(exprSet)]
#################### version1:批量做生存分析
library(survival)
library(survminer)
system.time(my.surv <- Surv(exprSet$times, exprSet$patient.vital_status)
log_rank_p <- apply(exprSet[,2:(ncol(exprSet)-2)], 2, function(values1){
group=ifelse(values1>median(values1),'high','low')
kmfit2 <- survfit(my.surv~group,data=exprSet)
#plot(kmfit2)
data.survdiff=survdiff(my.surv~group)
p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
}))
log_rank_p.filt <- sort(log_rank_p[log_rank_p<0.05])
log_rank_p.filt
## 绘制生存曲线图
library(survminer)
group <- ifelse(exprSet$LOC374491>median(exprSet$LOC374491),'high','low')
sfit <- survfit(Surv(times, patient.vital_status)~group, data=exprSet)
ggsurvplot(sfit, conf.int=FALSE, pval=TRUE)
#################### version2:批量做生存分析
library(survival)
library(survminer)
#尝试使用并行运算
library(parallel)
#detectCores()检查当前电脑可用核数
cl.cores <- detectCores()
#makeCluster(cl.cores)使用刚才检测的核并行运算,我的服务器是28核56线程,我就用50吧
cl <- makeCluster(8)
#这是坑,parApply里面用到的函数以及变量都需要申明,不声明就必须用模块
clusterExport(cl,c("exprSet","my.surv"))
#length(names(esprSet))-2,为什么减去2,因为之前小规模测试时,我们知道最后两个是time和event,不是表达量
#注意是parApply,A要大写的
my.surv <- Surv(exprSet$times, exprSet$patient.vital_status)
system.time(log_rank_p <- parApply(cl,exprSet[,2:length(names(exprSet))-2],2,function(values){
group=ifelse(values>median(na.omit(values)),'high','low')
kmfit2 <- survival::survfit(my.surv~group,data=exprSet)
#plot(kmfit2)
data.survdiff=survival::survdiff(my.surv~group)
p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
}))
# 这个运行的时间可能就是2分钟 终止并行化
stopCluster(cl)
# 找出小于0.05的P.val
log_rank_p <- log_rank_p[log_rank_p<0.05]
# 筛选后排序,并获得基因名
gene_diff <- as.data.frame(sort(log_rank_p))
#################### version3:批量做生存分析
library(survival)
library(survminer)
library(future.apply)
# 其次,告诉R语言我要并行化
# plan(multiprocess)
plan("multiprocess", workers = 16)
rt <- exprSet
genes <- colnames(rt)[-c(1,(ncol(exprSet)-1):ncol(exprSet))]
system.time(res3 <- future_lapply(1:length(genes), function(i){
group = ifelse(rt[,genes[i]]>median(rt[,genes[i]]),"high","low")
if(length(table(group))==1) return(NULL)
surv =as.formula(paste('Surv(times, patient.vital_status)~', "group"))
data = cbind(rt[,(ncol(exprSet)-1):ncol(exprSet)],group)
x = survdiff(surv, data = data)
pValue=1-pchisq(x$chisq,df=1)
return(c(genes[i],pValue))
}))
############# version4: foreach 并行批量做生存分析
library(foreach)
library(doParallel)
# ??foreach
cl.cores <- detectCores()
cl<- makeCluster(ceiling(cl.cores*2/3))
registerDoParallel(cl)
# 计算生存分析
res4 <- data.frame()
genes <- colnames(rt)[-c(1,(ncol(exprSet)-1):ncol(exprSet))]
system.time(foreach(i=1:length(genes), .combine='c', .inorder=FALSE,.packages = c('survival','survminer')) %dopar% {
group = ifelse(rt[,genes[i]]>median(rt[,genes[i]]),"high","low")
if(length(table(group))==1) next
surv =as.formula(paste('Surv(times, patient.vital_status)~', "group"))
data = cbind(rt[,(ncol(exprSet)-1):ncol(exprSet)],group)
x = survdiff(surv, data = data)
pValue=1-pchisq(x$chisq,df=1)
res4[i,1] = genes[i]
res4[i,2] = pValue
})
stopCluster(cl) # 关闭多进程并行
names(res4) <- c("ID","pValue_log")
res4