转录组差异分析代码DESeq2+edgeR+limma(voom)

awk_bioinfo

已于 2023-07-21 09:15:25 修改

阅读量964

点赞数 1

文章标签：生物信息

于 2023-05-09 13:53:34 首次发布

本文链接：https://blog.csdn.net/qq_36608036/article/details/130578558

版权

1. DESeq2

DESeq2是目前最常用的差异分析R包。除了可以导入counts外，如果上游使用salmon，DESeq2官方还给出了直接导入tximport生成的txi对象的方法。

library(DESeq2)
library("BiocParallel") #启用多核计算
readcount = 'All_sample_counts.xls'
database_all <- read.table(file = readcount, sep = "\t", header = T, row.names = 1)
database <- database_all[,c(1:3,7:9)]
#type <- factor(c(rep("LC_1",3), rep("LC_2",3)))
database <- round(as.matrix(database))

condition <- factor(c(rep("LPS",3), rep("PBS",3)))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
# 指定哪一组作为对照组
dds$condition <- relevel(dds$condition, ref = "PBS")

# 使用DESeq函数进行估计离散度，然后进行标准的差异表达分析，得到res对象结果
dds <- DESeq(dds)
res0 <- results(dds)
res = na.omit(res0)
res <- res[order(res$pvalue),]
head(res)
resdata = data.frame(Gene = rownames(res),logFC = res$log2FoldChange,logCPM = res$lfcSE,PValue=res$pvalue,FDR=res$padj)
# resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
write.table(resdata,file = "All_sample_counts.xls.LPS_vs_PBS.DESeq2.DE_results",row.names = F,quote = F,sep = '\t')

2. edgeR

使用EdgeR时注意选择合适的分析算法，官方建议bulk RNA-seq选择quasi-likelihood(QL) F-test tests，scRNA-seq 或是没有重复样品的数据选用 likelihood ratio test。

library(edgeR)
data = read.table(readcount, header=T, row.names=1, com='')
col_ordering = c(1:3,7:9)
rnaseqMatrix = data[,col_ordering]
rnaseqMatrix = round(rnaseqMatrix)
rnaseqMatrix = rnaseqMatrix[rowSums(cpm(rnaseqMatrix) > 1) >= 2,]
conditions = factor(c(rep("LPS", 3), rep("PBS", 3)))

exp_study = DGEList(counts=rnaseqMatrix, group=conditions)
exp_study = calcNormFactors(exp_study)
exp_study = estimateCommonDisp(exp_study)
exp_study = estimateTagwiseDisp(exp_study)
et = exactTest(exp_study, pair=c("LPS", "PBS"))
tTags = topTags(et,n=NULL)
result_table = tTags$table
result_table = data.frame(sampleA="LPS", sampleB="PBS", result_table)
result_table$logFC = -1 * result_table$logFC
Gene <- rownames(result_table)
result_table = cbind(Gene,as.data.frame(result_table[,c(3,4,5,6)]))
write.table(result_table, file='All_sample_counts.xls.LPS_vs_PBS.edgeR.DE_results', sep='	', quote=F, row.names=F)

3. limma

limma进行差异分析有两种方法：limma-trend和voom，在样本测序深度相差不大时两种方法差距不大，而测序深度相差大时voom更有优势，因此我们一般都选择voom的方法进行差异分析。Limma-voom vs limma-trend (bioconductor.org)

library(limma)
library(edgeR)

#分组矩阵design构建
design <- model.matrix(~0+factor(group_list)) #构建分组矩阵
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(counts)

## 表达矩阵DGEList构建与过滤低表达基因
dge <- DGEList(counts=counts) 
keep.exprs <- filterByExpr(dge,design=design) #过滤低表达基因
dge <- dge[keep.exprs,,keep.lib.sizes=FALSE] 
dge <- calcNormFactors(dge)  #归一化基因表达分布,得到的归一化系数被用作文库大小的缩放系数
cont.matrix <- makeContrasts(contrasts=paste0(exp,'-',ctr), #比对顺序实验/对照
                             levels = design)

## DE分析  limma-trend（logCPM,有相似文库大小）  or  voom（文库大小差异大）
# de <- cpm(dge, log=TRUE, prior.count=3)  #如选择logCPM，则eBayes设trend=TRUE
de <- voom(dge,design,plot=TRUE, normalize="quantile")
fit1 <- lmFit(de, design)               #线性拟合
fit2 <- contrasts.fit(fit1,cont.matrix) #统计检验
efit <- eBayes(fit2, trend=F)  #Apply empirical Bayes smoothing to the standard errors

tempDEG <- topTable(efit, coef=paste0(exp,'-',ctr), n=Inf)  #padj值从小到大排列
DEG_limma_voom  <- na.omit(tempDEG)