php 生存分析,科学网—生存分析案例 - 张洪磊的博文-CSDN博客

#!/usr/bin/Rscript

library(survival)

library(limma)

file.create("survive.txt")

#files

tumors

for (tumor in tumors){

exp_file

#read in files

rna=read.table(exp_file,header=TRUE,row.names=1,sep="t",stringsAsFactors=FALSE)

#get the index of the normal/tumor samples

t_index

n_index

#first I remove genes whose expression is == 0 in more than 50% of the samples:

rem

remove dim(x)[2]*0.8)

return(remove)

}

remove

rna

#calculate z-scores :(value - mean normal)/SD normal

# scal

# mean_n

# sd_n

# res

# colnames(res)

# rownames(res)

# for(i in 1:dim(x)[1]){

# for(j in 1:dim(x)[2]){

# res[i,j]

# }

# return(res)

# }

z_rna

#set the rownames and colnames

colnames(z_rna)

rm(rna)

print(paste(i,"-----------------finish first part",sep=""))

#######################################################################

# read in clinical data

clinical_file

clinical

clinical$IDs

rownames(clinical)

sum(clinical$IDs %in% colnames(z_rna))

#days_to_new_tumor_event_after_initial_treatment : isolate the max value

ind_keep

new_tum

new_tum_collapsed

for (k in 1:nrow(new_tum)){

if(sum(is.na(new_tum[k,])) < ncol(new_tum)){

new_tum_collapsed

} else {

new_tum_collapsed

}

#do the same to death: isolate the max value

ind_keep

death

death_collapsed

for (k in 1:nrow(death)){

if( sum(is.na(death[k,])) < ncol(death) ){

death_collapsed

} else {

death_collapsed

}

#days_to_last_followup: isolate the min value

ind_keep

fl_collapsed

for (k in 1:nrow(fl)){

if( sum(is.na(fl[k,])) < ncol(fl) ){

fl_collapsed

} else {

fl_collapsed

}

#put everything together

all_clin

colnames(all_clin)

#now, to do survival analysis we need three main things:

#1- time: this is the time till an event happens

#2- status: this indicates which patients have to be kept for the analysis

#3- event: this tells i.e. which patients have the gene up- or down-regulated or have no changes in expression

#since we want to do censored analysis we need to have something to censor the data with.

#for example, if a patient has no death data BUT there is a date to last followup

#it means that after that day we know nothing about the patient, therefore after that day it

#cannot be used for calculations/Kaplan Meier plot anymore, therefore we censor it.

#so now we need to create vectors for both "time to new tumor" and 'time to death" that contain

#also the data from censored individuals

# create vector with time to new tumor containing data to censor for new_tumor

all_clin$new_time

for (i in 1:length(as.numeric(as.character(all_clin$new_tumor_days)))){

all_clin$new_time[i]

as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$new_tumor_days))[i])

}

# create vector time to death containing values to censor for death

all_clin$new_death

for (i in 1:length(as.numeric(as.character(all_clin$death_days)))){

all_clin$new_death[i]

as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$death_days))[i])

}

#now we need to create the vector for censoring the data which means telling R which patients are dead or have new tumor. in this case

#if a patient has a “days_to_death” it will be assigned 1, and used in the corresponding analysis. the reason why we

#censor with death events even for recurrence is pretty important. a colleague made me notice that this is a competitive

#risk problem, where, although a patient can recur and then die, if a patient is dead, it will not recur, therefore is

#more accurate to censor for death events.

#create vector for death censoring

all_clin$death_event

#finally add row.names to clinical

rownames(all_clin)

#create event vector for RNASeq data

#event_rna 1.96,1,ifelse(i< -1.96,0,2)))) #up and down 1:>1.96;0: -1.96 and <1.96

#event_rna 1.96,1,0))) # altered and not altered

event_rna

colnames(event_rna)

rownames(event_rna)

med

for (m in 1:nrow(z_rna)){

for (n in 1:ncol(z_rna)){

event_rna[m,n] med[m],1,0)

}

#pick your gene

gene_list

genes

print(paste("intersect genes",intersect(genes,rownames(event_rna))))

common_samples

#Calculate each gene

for (gene in genes){

factor

time

event

#survdiff

#HR

up95

low95

#high_num and low_num

high_num

low_num

#plot data

png(paste(tumor,gene,".survive.png",sep=""))

plot(s,col=c(1:3), frame=F, lwd=2, main=paste(tumor,gene,sep=":"))

# add lines for the median survival

if(x1 != "NA" & x2 != "NA"){

lines(c(0,x1),c(0.5,0.5),col="blue")

lines(c(x1,x1),c(0,0.5),col="black")

lines(c(x2,x2),c(0,0.5),col="red")

}

# add legend

legend(1800,0.995,legend=paste("p.value = ",pv[[1]],sep=""),bty="n",cex=1.4)

legend(0.5,1.5,legend=paste("HR = ",HR,sep=""),bty="n",cex=1.4)

legend(max(all_clin[common_samples,"new_death"],na.rm = T)*0.7,0.94,

legend=c(paste("high=",high_num),paste("low=",low_num )),bty="n",cex=1.3,lwd=3,col=c("black","red"))

dev.off()

sf_content

print(sf_content)

write.table(sf_content,"sf.survival",append=TRUE,quote=FALSE,col.names=FALSE,row.names=FALSE)

}

转载本文请联系原作者获取授权，同时请注明本文来自张洪磊科学网博客。

链接地址：http://blog.sciencenet.cn/blog-2609994-992077.html

上一篇：正则

下一篇：多变量计算不同水平的overlap