去除RNA-seq数据批次效应

qq_27390023

已于 2022-03-06 11:00:00 修改

阅读量7.2k

点赞数 2

文章标签： r语言生物信息学

于 2022-03-06 10:58:54 首次发布

本文链接：https://blog.csdn.net/qq_27390023/article/details/123306912

版权

该文介绍了如何处理RNA-seq数据中的批次效应，首先通过DESeq2和PCA分析识别批次效应，然后使用sva包的ComBat函数及DESeq2自身的调整来消除批次效应，最后探讨了limma包的removeBatchEffect方法。通过这些方法，可以确保后续差异表达分析的准确性，减少假阳性结果。

摘要由CSDN通过智能技术生成

整合不同的表达谱数据时，往往需要查看和去除批次效用。可以根据样本的信息：批次，建库方法、测序方法等，作图查看这些因素是否有影响。去除 batch effect 后再研究不同处理下的差异表达基因，可以减少假阳性。

1. 载入有批次效应的数据

rm(list=ls())
setwd("/Users/xxx/scripts/R_scripts")

## 1. 载入有批次效应的数据
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("pasilla",force= TRUE)

library(pasilla)

dataFilesDir = system.file("extdata", package = "pasilla", mustWork=TRUE)
pasillaSampleAnno = read.csv(file.path(dataFilesDir, "pasilla_sample_annotation.csv"))
# pasillaSampleAnno

count_df <- read.table(file.path(dataFilesDir,"pasilla_gene_counts.tsv"),
                           sep="\t",header = TRUE, row.names=1)
count_matrix <- as.matrix(count_df)

#样品信息数据框，行名为表达矩阵的列名
colData = data.frame(
  condition = as.factor(c(rep("untreated",4),rep("treated",3))),
  type = as.factor(c("SR","SR","PE","PE","SR","PE","PE")))
rownames(colData) <- colnames(count_matrix)

#dir(system.file("extdata", package = "pasilla", mustWork=TRUE), pattern = ".txt$")
#gffFile = file.path(dataFilesDir, "Dmel.BDGP5.25.62.DEXSeq.chr.gff")

# library("DEXSeq")
# dxd = DEXSeqDataSetFromHTSeq(
#   countfiles = file.path(dataFilesDir, paste(pasillaSampleAnno$file, "txt", sep=".")), 
#   sampleData = pasillaSampleAnno,
#   design= ~ sample + exon + condition:exon,
#   flattenedfile = gffFile)
# dxd

2. 可视化查看批次效应

## 2. 可视化查看批次效应
require(DESeq2)
library(ggplot2)
dds = DESeqDataSetFromMatrix(countData = count_matrix,
                             colData = colData,
                             design= ~ condition+type)

pcaData <- plotPCA(DESeqTransform(dds), intgroup=c("condition", "type"), 
                   returnData=TRUE)

# plotPCA {DESeq2}
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition, shape = type)) +
  geom_point(size=3) +
  # xlim(-4e+05, 5e+05) +
  # ylim(-4e+05, 2e+05) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  geom_text(aes(label=name),vjust=2)

#ggsave("myPCAWithBatchEffect.png")

3.使用sva包的ComBat 函数去除批次效应

## 3.使用sva包的ComBat 函数去除批次效应
library(sva)
count_mat <- ComBat(dat=count_matrix, batch=colData$type)
# ComBat:Adjust for batch effects using an empirical Bayes framework
count_mat[1:3,1:5]

4. DESeq2包消除批次效应方法

## 4. DESeq2包消除批次效应
# design 设计矩阵中加入引起批次效应的因素(SR or PE)
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = count_matrix,
                              colData = colData,
                              design = ~ condition+ type)
dds <- DESeq(dds)
resultsNames(dds)
res <- results(dds, name="condition_untreated_vs_treated")

summary(res)
head(res)
resOdered <- res[order(res$padj),]
deg <- as.data.frame(resOdered)
#deg <- na.omit(deg)
dim(deg)
#write.csv(deg,file= "diff_deseq2.csv")

# 看看批次效应导致的差异表达基因,
# 这里是测序方法SR(single-end)和PE (paired-end)
batch_res <-  results(dds, name="type_SR_vs_PE")
summary(batch_res)
head(batch_res)
batchResOdered <- res[order(batch_res$padj),]
batchDeg <- as.data.frame(batchResOdered)
batchDeg <- na.omit(batchDeg)
dim(batchDeg)
head(batchDeg)

5. 用limma的removeBatchEffect去除批次效应

## 5. 用limma的removeBatchEffect去除批次效应
 ### 不推荐
require(limma)
require(edgeR)  # DGEList来自edgeR包

# design <- model.matrix(~0+colData$condition)
# expr_mat<- removeBatchEffect(count_matrix, batch=colData$type, 
#                   design= design)

expr_mat<- removeBatchEffect(count_matrix, batch=colData$type) 
# table(expr_mat<0) #会出现负数！
expr_mat[which(expr_mat<0)]=0 # 不知是否合理！

expr_mat <- round(expr_mat) # read 为整数

# removeBatchEffect:This function is not intended to be used 
# prior to linear modelling. For linear modelling, 
# it is better to include the batch factors in the linear model.

# 查看去掉批次效应之后的数据
count_matrix[1:3,1:5]
expr_mat[1:3,1:5]

# 接着，查看数据PCA的分布是否消除了批次效应，
# 再用去除批次效应的expr_mat做下游分析
# dds = DESeqDataSetFromMatrix(countData = expr_mat,
#                              colData = colData,
#                              design= ~ condition +type)
# 
# pcaData <- plotPCA(DESeqTransform(dds), intgroup=c("condition", "type"), 
#                    returnData=TRUE)
# 
# # plotPCA {DESeq2}
# percentVar <- round(100 * attr(pcaData, "percentVar"))
# ggplot(pcaData, aes(PC1, PC2, color=condition, shape = type)) +
#   geom_point(size=3) +
#   # xlim(-4e+05, 5e+05) +
#   # ylim(-4e+05, 2e+05) +
#   xlab(paste0("PC1: ",percentVar[1],"% variance")) +
#   ylab(paste0("PC2: ",percentVar[2],"% variance")) +
#   geom_text(aes(label=name),vjust=2)