TCGA差异分析
2020.05.08更新:原文由于当时实验赶时间,只是网上随便收集了一下方法,并不是很完善,现在做出了修改
TCGA差异分析一般不直接使用T检验(除非下载的是FPKM数据),而是下载Counts数据然后利用limma包或者edge包进行差异分析
这里记录一下limma包和edge包差异分析的使用,以防太久没用的时候忘记
使用到的包
library(limma)
library(edgeR)
读入数据和预处理
##读取数据
## 读入counts数据,不要标准化
options(stringsAsFactors = F)
normal <- read.delim("E:/daiMa/R/Drug resistance/TCGA-LIHC-N.txt", row.names=1, stringsAsFactors=FALSE)
cancer <- read.delim("E:/daiMa/R/Drug resistance/TCGA-LIHC-T.txt", row.names=1)
counts <- cbind(normal,cancer)
dim(counts)
##去重
geneid <- as.factor(row.names(counts))
counts_delrep <- apply(counts,2,function(x) tapply(x, geneid, mean))
## 读入label文件
label <- read.table("label.txt",header=TRUE,check.names = FALSE,sep='\t')
Exp <- counts_delrep
name <- colnames(counts_delrep)
name <- substr(name,1,12)
## 对标签
loc <- match(name,label[,1])
na_loc <- which(is.na(loc))
##去掉对不上标签的
loc <- loc[-na_loc]
Exp <- Exp[,-na_loc] ## 去掉没有对上标签的样本
Label <- label[loc,2]
length(Label)
dim(Exp)
设置分组信息
## 设置分组信息
group <- as.factor(Label)
design <- model.matrix(~0+group)
row.names(design) <- colnames(Exp)
上面的部分edgeR和limma包都是一样的,下面的标准化和找差异就不一样了
limma
DGElist <- DGEList( counts = Exp, group = group )
## 过滤掉cpm小于等于1的基因
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2
DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]
## 标准化
DGElist <- calcNormFactors( DGElist )
## 将数据转化成logCMP,并且画出拟合的线性模型
v <- voom(DGElist, design, plot = TRUE, normalize = "quantile")
fit <- lmFit(v, design)
这里的**‘FEMALE-MALE’** 根据自己的标签进行修改
cont.matrix <- makeContrasts(contrasts = c('FEMALE-MALE'), levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
nrDEG_limma_voom = topTable(fit2, coef = 'FEMALE-MALE', n = Inf)
nrDEG_limma_voom = na.omit(nrDEG_limma_voom)
head(nrDEG_limma_voom)
提取差异基因
FDR = 0.05
foldChange= 2
DEProtein = nrDEG_edgeR[(nrDEG_edgeR$FDR < FDR & (nrDEG_edgeR$logFC>foldChange | nrDEG_edgeR$logFC<(-foldChange))),]
DEProtein = DEProtein[order(DEProtein$logFC),]
dim(DEProtein)
write.csv(DEProtein,"DEprotein.csv",quote = F)
edgeR
## 把负数的表达值改成0(edgeR不允许出现负的表达值)
Exp[Exp<0] <- 0
DGElist <- DGEList(counts = Exp,group = group)
## 过滤基因
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2
DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]
## 标准化
DGElist <- calcNormFactors( DGElist )
DGElist <- estimateGLMCommonDisp(DGElist, design)
DGElist <- estimateGLMTrendedDisp(DGElist, design)
DGElist <- estimateGLMTagwiseDisp(DGElist, design)
fit <- glmFit(DGElist, design)
results <- glmLRT(fit, contrast = c(-1, 1))
nrDEG_edgeR <- topTags(results, n = nrow(DGElist))
nrDEG_edgeR <- as.data.frame(nrDEG_edgeR)
head(nrDEG_edgeR)
提取差异基因
## 提取差异基因
FDR = 0.05
foldChange= 2
DEProtein = nrDEG_edgeR[(nrDEG_edgeR$FDR < FDR & (nrDEG_edgeR$logFC>foldChange | nrDEG_edgeR$logFC<(-foldChange))),]
DEProtein = DEProtein[order(DEProtein$logFC),]
dim(DEProtein)
write.csv(DEProtein,"DEprotein.csv",quote = F)