整合不同的表达谱数据时,往往需要查看和去除批次效用。可以根据样本的信息:批次,建库方法、测序方法等,作图查看这些因素是否有影响。去除 batch effect 后再研究不同处理下的差异表达基因,可以减少假阳性。
1. 载入有批次效应的数据
rm(list=ls())
setwd("/Users/xxx/scripts/R_scripts")
## 1. 载入有批次效应的数据
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("pasilla",force= TRUE)
library(pasilla)
dataFilesDir = system.file("extdata", package = "pasilla", mustWork=TRUE)
pasillaSampleAnno = read.csv(file.path(dataFilesDir, "pasilla_sample_annotation.csv"))
# pasillaSampleAnno
count_df <- read.table(file.path(dataFilesDir,"pasilla_gene_counts.tsv"),
sep="\t",header = TRUE, row.names=1)
count_matrix <- as.matrix(count_df)
#样品信息数据框,行名为表达矩阵的列名
colData = data.frame(
condition = as.factor(c(rep("untreated",4),rep("treated",3))),
type = as.factor(c("SR","SR","PE","PE","SR","PE","PE")))
rownames(colData) <- colnames(count_matrix)
#dir(system.file("extdata", package = "pasilla", mustWork=TRUE), pattern = ".txt$")
#gffFile = file.path(dataFilesDir, "Dmel.BDGP5.25.62.DEXSeq.chr.gff")
# library("DEXSeq")
# dxd = DEXSeqDataSetFromHTSeq(
# countfiles = file.path(dataFilesDir, paste(pasillaSampleAnno$file, "txt", sep=".")),
# sampleData = pasillaSampleAnno,
# design= ~ sample + exon + condition:exon,
# flattenedfile = gffFile)
# dxd
2. 可视化查看批次效应
## 2. 可视化查看批次效应
require(DESeq2)
library(ggplot2)
dds = DESeqDataSetFromMatrix(countData = count_matrix,
colData = colData,
design= ~ condition+type)
pcaData <- plotPCA(DESeqTransform(dds), intgroup=c("condition", "type"),
returnData=TRUE)
# plotPCA {DESeq2}
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition, shape = type)) +
geom_point(size=3) +
# xlim(-4e+05, 5e+05) +
# ylim(-4e+05, 2e+05) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_text(aes(label=name),vjust=2)
#ggsave("myPCAWithBatchEffect.png")
3.使用sva包的ComBat 函数去除批次效应
## 3.使用sva包的ComBat 函数去除批次效应
library(sva)
count_mat <- ComBat(dat=count_matrix, batch=colData$type)
# ComBat:Adjust for batch effects using an empirical Bayes framework
count_mat[1:3,1:5]
4. DESeq2包消除批次效应方法
## 4. DESeq2包消除批次效应
# design 设计矩阵中加入引起批次效应的因素(SR or PE)
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = count_matrix,
colData = colData,
design = ~ condition+ type)
dds <- DESeq(dds)
resultsNames(dds)
res <- results(dds, name="condition_untreated_vs_treated")
summary(res)
head(res)
resOdered <- res[order(res$padj),]
deg <- as.data.frame(resOdered)
#deg <- na.omit(deg)
dim(deg)
#write.csv(deg,file= "diff_deseq2.csv")
# 看看批次效应导致的差异表达基因,
# 这里是测序方法SR(single-end)和PE (paired-end)
batch_res <- results(dds, name="type_SR_vs_PE")
summary(batch_res)
head(batch_res)
batchResOdered <- res[order(batch_res$padj),]
batchDeg <- as.data.frame(batchResOdered)
batchDeg <- na.omit(batchDeg)
dim(batchDeg)
head(batchDeg)
5. 用limma的removeBatchEffect去除批次效应
## 5. 用limma的removeBatchEffect去除批次效应
### 不推荐
require(limma)
require(edgeR) # DGEList来自edgeR包
# design <- model.matrix(~0+colData$condition)
# expr_mat<- removeBatchEffect(count_matrix, batch=colData$type,
# design= design)
expr_mat<- removeBatchEffect(count_matrix, batch=colData$type)
# table(expr_mat<0) #会出现负数!
expr_mat[which(expr_mat<0)]=0 # 不知是否合理!
expr_mat <- round(expr_mat) # read 为整数
# removeBatchEffect:This function is not intended to be used
# prior to linear modelling. For linear modelling,
# it is better to include the batch factors in the linear model.
# 查看去掉批次效应之后的数据
count_matrix[1:3,1:5]
expr_mat[1:3,1:5]
# 接着,查看数据PCA的分布是否消除了批次效应,
# 再用去除批次效应的expr_mat做下游分析
# dds = DESeqDataSetFromMatrix(countData = expr_mat,
# colData = colData,
# design= ~ condition +type)
#
# pcaData <- plotPCA(DESeqTransform(dds), intgroup=c("condition", "type"),
# returnData=TRUE)
#
# # plotPCA {DESeq2}
# percentVar <- round(100 * attr(pcaData, "percentVar"))
# ggplot(pcaData, aes(PC1, PC2, color=condition, shape = type)) +
# geom_point(size=3) +
# # xlim(-4e+05, 5e+05) +
# # ylim(-4e+05, 2e+05) +
# xlab(paste0("PC1: ",percentVar[1],"% variance")) +
# ylab(paste0("PC2: ",percentVar[2],"% variance")) +
# geom_text(aes(label=name),vjust=2)