#!/usr/bin/Rscript
library(survival)
library(limma)
file.create("survive.txt")
#files
tumors
for (tumor in tumors){
exp_file
#read in files
rna=read.table(exp_file,header=TRUE,row.names=1,sep="t",stringsAsFactors=FALSE)
#get the index of the normal/tumor samples
t_index
n_index
#first I remove genes whose expression is == 0 in more than 50% of the samples:
rem
x
x
r
remove dim(x)[2]*0.8)
return(remove)
}
remove
rna
#calculate z-scores :(value - mean normal)/SD normal
# scal
# mean_n
# sd_n
# res
# colnames(res)
# rownames(res)
# for(i in 1:dim(x)[1]){
# for(j in 1:dim(x)[2]){
# res[i,j]
# }
# }
# return(res)
# }
z_rna
#set the rownames and colnames
colnames(z_rna)
rm(rna)
print(paste(i,"-----------------finish first part",sep=""))
#######################################################################
# read in clinical data
clinical_file
clinical
clinical
clinical$IDs
rownames(clinical)
sum(clinical$IDs %in% colnames(z_rna))
#days_to_new_tumor_event_after_initial_treatment : isolate the max value
ind_keep
new_tum
new_tum_collapsed
for (k in 1:nrow(new_tum)){
if(sum(is.na(new_tum[k,])) < ncol(new_tum)){
m
new_tum_collapsed
} else {
new_tum_collapsed
}
}
#do the same to death: isolate the max value
ind_keep
death
death_collapsed
for (k in 1:nrow(death)){
if( sum(is.na(death[k,])) < ncol(death) ){
m
death_collapsed
} else {
death_collapsed
}
}
#days_to_last_followup: isolate the min value
ind_keep
fl
fl_collapsed
for (k in 1:nrow(fl)){
if( sum(is.na(fl[k,])) < ncol(fl) ){
m
fl_collapsed
} else {
fl_collapsed
}
}
#put everything together
all_clin
colnames(all_clin)
#now, to do survival analysis we need three main things:
#1- time: this is the time till an event happens
#2- status: this indicates which patients have to be kept for the analysis
#3- event: this tells i.e. which patients have the gene up- or down-regulated or have no changes in expression
#since we want to do censored analysis we need to have something to censor the data with.
#for example, if a patient has no death data BUT there is a date to last followup
#it means that after that day we know nothing about the patient, therefore after that day it
#cannot be used for calculations/Kaplan Meier plot anymore, therefore we censor it.
#so now we need to create vectors for both "time to new tumor" and 'time to death" that contain
#also the data from censored individuals
# create vector with time to new tumor containing data to censor for new_tumor
all_clin$new_time
for (i in 1:length(as.numeric(as.character(all_clin$new_tumor_days)))){
all_clin$new_time[i]
as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$new_tumor_days))[i])
}
# create vector time to death containing values to censor for death
all_clin$new_death
for (i in 1:length(as.numeric(as.character(all_clin$death_days)))){
all_clin$new_death[i]
as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$death_days))[i])
}
#now we need to create the vector for censoring the data which means telling R which patients are dead or have new tumor. in this case
#if a patient has a “days_to_death” it will be assigned 1, and used in the corresponding analysis. the reason why we
#censor with death events even for recurrence is pretty important. a colleague made me notice that this is a competitive
#risk problem, where, although a patient can recur and then die, if a patient is dead, it will not recur, therefore is
#more accurate to censor for death events.
#create vector for death censoring
all_clin$death_event
#finally add row.names to clinical
rownames(all_clin)
#create event vector for RNASeq data
#event_rna 1.96,1,ifelse(i< -1.96,0,2)))) #up and down 1:>1.96;0: -1.96 and <1.96
#event_rna 1.96,1,0))) # altered and not altered
event_rna
event_rna
colnames(event_rna)
rownames(event_rna)
med
for (m in 1:nrow(z_rna)){
for (n in 1:ncol(z_rna)){
event_rna[m,n] med[m],1,0)
}
}
#pick your gene
gene_list
genes
print(paste("intersect genes",intersect(genes,rownames(event_rna))))
common_samples
#Calculate each gene
for (gene in genes){
factor
time
event
#survdiff
s
s1
pv
#HR
HR
up95
low95
#high_num and low_num
high_num
low_num
#plot data
png(paste(tumor,gene,".survive.png",sep=""))
plot(s,col=c(1:3), frame=F, lwd=2, main=paste(tumor,gene,sep=":"))
# add lines for the median survival
x1
x2
if(x1 != "NA" & x2 != "NA"){
lines(c(0,x1),c(0.5,0.5),col="blue")
lines(c(x1,x1),c(0,0.5),col="black")
lines(c(x2,x2),c(0,0.5),col="red")
}
# add legend
legend(1800,0.995,legend=paste("p.value = ",pv[[1]],sep=""),bty="n",cex=1.4)
legend(0.5,1.5,legend=paste("HR = ",HR,sep=""),bty="n",cex=1.4)
legend(max(all_clin[common_samples,"new_death"],na.rm = T)*0.7,0.94,
legend=c(paste("high=",high_num),paste("low=",low_num )),bty="n",cex=1.3,lwd=3,col=c("black","red"))
dev.off()
sf_content
print(sf_content)
write.table(sf_content,"sf.survival",append=TRUE,quote=FALSE,col.names=FALSE,row.names=FALSE)
}
}
转载本文请联系原作者获取授权,同时请注明本文来自张洪磊科学网博客。
链接地址:http://blog.sciencenet.cn/blog-2609994-992077.html
上一篇:正则
下一篇:多变量计算不同水平的overlap