TCGA的差异分析（limma和edgeR）

最新推荐文章于 2024-09-16 12:34:16 发布

一条兔子

最新推荐文章于 2024-09-16 12:34:16 发布

阅读量1.5w

点赞数 10

分类专栏：生信分析

本文链接：https://blog.csdn.net/tu__zi/article/details/105617657

版权

生信分析专栏收录该内容

4 篇文章 7 订阅

订阅专栏

TCGA差异分析

文章目录

TCGA差异分析
@[toc]
使用到的包
读入数据和预处理
设置分组信息
limma
edgeR
参考

2020.05.08更新：原文由于当时实验赶时间，只是网上随便收集了一下方法，并不是很完善，现在做出了修改

TCGA差异分析一般不直接使用T检验（除非下载的是FPKM数据），而是下载Counts数据然后利用limma包或者edge包进行差异分析

这里记录一下limma包和edge包差异分析的使用，以防太久没用的时候忘记

使用到的包

library(limma)
library(edgeR)

读入数据和预处理

##读取数据
## 读入counts数据，不要标准化
options(stringsAsFactors = F)
normal <- read.delim("E:/daiMa/R/Drug resistance/TCGA-LIHC-N.txt", row.names=1, stringsAsFactors=FALSE)
cancer <- read.delim("E:/daiMa/R/Drug resistance/TCGA-LIHC-T.txt", row.names=1)
counts <- cbind(normal,cancer)
dim(counts)

##去重
geneid <- as.factor(row.names(counts))
counts_delrep <- apply(counts,2,function(x) tapply(x, geneid, mean))

## 读入label文件
label <- read.table("label.txt",header=TRUE,check.names = FALSE,sep='\t')

Exp <- counts_delrep
name <- colnames(counts_delrep)
name <- substr(name,1,12)
## 对标签
loc <- match(name,label[,1])
na_loc <- which(is.na(loc))
##去掉对不上标签的
loc <- loc[-na_loc]
Exp <- Exp[,-na_loc]     ## 去掉没有对上标签的样本
Label <- label[loc,2]
length(Label)
dim(Exp)

设置分组信息

## 设置分组信息
group <- as.factor(Label)
design <- model.matrix(~0+group)
row.names(design) <- colnames(Exp)

上面的部分edgeR和limma包都是一样的，下面的标准化和找差异就不一样了

limma

DGElist <- DGEList( counts = Exp, group = group )
## 过滤掉cpm小于等于1的基因
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2 
DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]

## 标准化
DGElist <- calcNormFactors( DGElist )

## 将数据转化成logCMP，并且画出拟合的线性模型
v <- voom(DGElist, design, plot = TRUE, normalize = "quantile")
fit <- lmFit(v, design)

这里的**‘FEMALE-MALE’** 根据自己的标签进行修改

cont.matrix <- makeContrasts(contrasts = c('FEMALE-MALE'), levels = design)

fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

nrDEG_limma_voom = topTable(fit2, coef = 'FEMALE-MALE', n = Inf)
nrDEG_limma_voom = na.omit(nrDEG_limma_voom)
head(nrDEG_limma_voom)

提取差异基因

FDR = 0.05 
foldChange= 2 
DEProtein = nrDEG_edgeR[(nrDEG_edgeR$FDR < FDR & (nrDEG_edgeR$logFC>foldChange | nrDEG_edgeR$logFC<(-foldChange))),]
DEProtein = DEProtein[order(DEProtein$logFC),]
dim(DEProtein)
write.csv(DEProtein,"DEprotein.csv",quote = F)

edgeR

## 把负数的表达值改成0（edgeR不允许出现负的表达值）
Exp[Exp<0] <- 0
DGElist <- DGEList(counts = Exp,group = group)
## 过滤基因
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2

DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]
## 标准化
DGElist <- calcNormFactors( DGElist )
DGElist <- estimateGLMCommonDisp(DGElist, design)
DGElist <- estimateGLMTrendedDisp(DGElist, design)
DGElist <- estimateGLMTagwiseDisp(DGElist, design)

fit <- glmFit(DGElist, design)
results <- glmLRT(fit, contrast = c(-1, 1)) 
nrDEG_edgeR <- topTags(results, n = nrow(DGElist))
nrDEG_edgeR <- as.data.frame(nrDEG_edgeR)
head(nrDEG_edgeR)

提取差异基因

## 提取差异基因
FDR = 0.05 
foldChange= 2 
DEProtein = nrDEG_edgeR[(nrDEG_edgeR$FDR < FDR & (nrDEG_edgeR$logFC>foldChange | nrDEG_edgeR$logFC<(-foldChange))),]
DEProtein = DEProtein[order(DEProtein$logFC),]
dim(DEProtein)
write.csv(DEProtein,"DEprotein.csv",quote = F)