CIBERSORT 免疫浸润,参考这篇
非常清楚的写出了输入数据的要求:
1.不可以有负值和缺失值
2.不要取log
3.如果是芯片数据,昂飞芯片使用RMA标准化,Illumina 的Beadchip 和Agilent的单色芯片,用limma处理
4.如果是RNA-Seq表达量,使用FPKM和TPM都很合适。
rm(list = ls())
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) #对应清华源
options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor/") #对应清华源
exprMatrix = read.table(file = './combined_RNA_Expr.txt',header=TRUE,row.names=1, as.is=TRUE)
# process Ensembl ID ------------------------------------------------------
Ensemble_ID<- rownames(exprMatrix)
ID<- strsplit(Ensemble_ID, "[.]")
IDlast<- sapply(ID, "[", 1)
exprMatrix$ensembl_id<- IDlast
row.names(exprMatrix)<- exprMatrix$ensembl_id
# save(exprMatrix, file = 'TCGA.Rdata')
# load(file = 'TCGA.Rdata')
# trans to symbol ---------------------------------------------------------
# library(org.Hs.eg.db)
# ls("package:org.Hs.eg.db")
# g2s=toTable(org.Hs.egSYMBOL);head(g2s)
# g2e=toTable(org.Hs.egENSEMBL);head(g2e)
# tmp=merge(g2e,g2s,by='gene_id')
# head(tmp)
#
# exprMatrix<- merge(tmp,exprMatrix,by='ensembl_id')
# exprMatrix<- exprMatrix[,- c(1,2)]
#
# exprMatrix=exprMatrix[!duplicated(exprMatrix$symbol),]
# row.names(exprMatrix)<- exprMatrix[,1];exprMatrix<- exprMatrix[,-1]
library(tinyarray)
exprMatrix <- exprMatrix[,-ncol(exprMatrix)]
exp = trans_exp(exprMatrix,mrna_only = T)
# trans to TPM ------------------------------------------------------------
if(T){
library(rtracklayer)
gtf = rtracklayer::import("gencode.v22.annotation.gtf.gz");class(gtf)
# https://www.encodeproject.org/files/gencode.v22.annotation/ 在这里下载
gtf = as.data.frame(gtf);dim(gtf);table(gtf$type)
exon = gtf[gtf$type=="exon",c("start","end","gene_name")]
gle = lapply(split(exon,exon$gene_name),function(x){
tmp=apply(x,1,function(y){
y[1]:y[2]
})
length(unique(unlist(tmp)))
})
gle=data.frame(gene_name=names(gle),length=as.numeric(gle))
save(gle,file = "v22_gle.Rdata")
}
load("v22_gle.Rdata");head(gle)
# merge -------------------------------------------------------------------
le = gle[match(rownames(exp),gle$gene_name),"length"]
#这个函数是现成的。
countToTpm <- function(counts, effLen)
{
rate <- log(counts) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
exp2 <- apply(exp, 2, as.numeric);rownames(exp2) <- rownames(exp)
tpms <- apply(exp2,2,countToTpm,le)
tpms <- as.data.frame(tpms)
tpms <- rownames_to_column(tpms) #由于CIBERSORT.R读取文件的代码比较粗暴,为了适应它,导出文件之前需要把行名变成一列。不然后面就会有报错。
write.table(tpms,file = "./tpms_Matrix.txt",quote = F,sep = "\t",row.names = F)
# run CIBERSORT -----------------------------------------------------------
source('./source_CIBERSORT.R')
lm22<- read.csv(file = "LM22_sheet1.csv")
write.table(lm22,file = "LM22_sheet1.txt",sep = "\t",quote = F,row.names = F)
results <- CIBERSORT("LM22_sheet1.txt","tpms_Matrix.txt", perm = 1000, QN = TRUE)
save(results,file = '11_CIBERSORT_result.Rdata')