1. 数据集的加载
# ### batchelor ##
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("batchelor")
library(batchelor)
library(scRNAseq)
sce1 <- ZeiselBrainData()
sce2 <- TasicBrainData()
library(scuttle)
# 过滤,取子集
sce1 <- addPerCellQC(sce1, subsets=list(Mito=grep("mt-", rownames(sce1))))
qc1 <- quickPerCellQC(colData(sce1), sub.fields="subsets_Mito_percent")
sce1 <- sce1[,!qc1$discard]
sce2 <- addPerCellQC(sce2, subsets=list(Mito=grep("mt_", rownames(sce2))))
qc2 <- quickPerCellQC(colData(sce2), sub.fields="subsets_Mito_percent")
sce2 <- sce2[,!qc2$discard]
# 行:共同的基因,取子集
universe <- intersect(rownames(sce1), rownames(sce2))
sce1 <- sce1[universe,]
sce2 <- sce2[universe,]
2. 消除批间测序覆盖度(coverage)差异
# Per-batch scaling normalization
# to remove systematic differences in coverage across batches
# to simplify downstream comparisons.
out <- multiBatchNorm(sce1, sce2)
sce1 <- out[[1]]
sce2 <- out[[2]]
3. 差异表达基因分析
找出差异大的基因,用于下游的批间效应校正。
library(scran)
# modelGeneVar {scran}:Model the per-gene variance
dec1 <- modelGeneVar(sce1)
dec2 <- modelGeneVar(sce2)
combined.dec <- combineVar(dec1, dec2)
# 基因表达差异从大到小排序,选择前n个。
chosen.hvgs <- getTopHVGs(combined.dec, n=5000)
4. 批次差异校正
Usage
correctExperiments( ..., batch = NULL, restrict = NULL, subset.row = NULL, correct.all = FALSE, assay.type = "logcounts", PARAM = FastMnnParam(), combine.assays = NULL, combine.coldata = NULL, include.rowdata = TRUE, add.single = TRUE )
combined <- correctExperiments(A=sce1, B=sce2, PARAM=FastMnnParam ())
# 等同于
combined <- fastMNN(A=sce1, B=sce2)
### 其他方法
# Using fewer genes as it is much, much slower.
fewer.hvgs <- head(order(combined.dec$bio, decreasing=TRUE), 100)
classic.out <- correctExperiments(A=sce1, B=sce2,subset.row=fewer.hvgs,
PARAM=ClassicMnnParam ())
classic.out <- mnnCorrect(sce1, sce2, subset.row=fewer.hvgs)
rescale.out <- correctExperiments(A=sce1, B=sce2, PARAM=RescaleParam ())
rescale.out <- rescaleBatches(sce1, sce2)
regress.out <- correctExperiments(A=sce1, B=sce2, PARAM=RegressParam ())
regress.out <- regressBatches(sce1, sce2)
## assay(combined)[1:2,1:3] 不同于assay(sce1)[1:2,1:3]以及
# logcounts(sce1)[1:2,1:3],已经去除了批次效应
# PARAM:A BatchelorParam object specifying the batch correction method
# to dispatch to, and the parameters with which it should be run.
# ClassicMnnParam will dispatch to mnnCorrect;
# FastMnnParam will dispatch to fastMNN;
# RescaleParam will dispatch to rescaleBatches;
# and RegressParam will dispatch to regressBatches.
5. 表达谱数据降维可视化
library(scater)
set.seed(100)
# 没有去除批次效应的数据,可视化展示
combined <- runPCA(combined, subset_row=chosen.hvgs)
combined <- runTSNE(combined, dimred="PCA")
plotPCA(combined, colour_by="batch")
plotTSNE(combined, colour_by="batch")
# 去除批次效应后的数据可视化
# dim(reducedDim(combined, "corrected"))
combined <- runTSNE(combined, dimred="corrected")
plotTSNE(combined, colour_by="batch")
set.seed(101)
f.out <- fastMNN(A=sce1, B=sce2, subset.row=chosen.hvgs)
f.out3 <- fastMNN(A=sce1, B=sce2)
str(reducedDim(f.out, "corrected")) # 命名
rle(f.out$batch)
#table(f.out$batch)
set.seed(103)
f.out <- runTSNE(f.out, dimred="corrected")
plotTSNE(f.out, colour_by="batch")
5. 查看消除批次效应后的表达量数据
# 查看校正后基因的表达量
cor.exp <- assay(f.out)[1,]
hist(cor.exp, xlab="Corrected expression for gene 1", col="grey80")
assay(combined)
# counts(combined)[2,1:2874] # 同 counts(sce1)[2,]
# logcounts(combined)[2,1:2874] # 同 logcounts(sce1)[2,]