splatter包生成单细胞RNA测序数据

Splatter是一个模拟单细胞RNA测序计数数据的软件包。它提供了一个简单的界面,用于创建可复制且文档充分的复杂模拟。可以从真实数据估计参数,并提供用于比较真实数据集和模拟数据集的函数。

# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("splatter")

library(splatter)
library(scater)

### 1. 快速创建模拟数据
# Usage
# mockSCE(ncells = 200, ngenes = 2000, nspikes = 100)
set.seed(1)
sce <- mockSCE() # class: SingleCellExperiment
counts(sce)

### 2. 参数估计
# 从数据(模拟数据或真实的单细胞测序数据)中估计参数
params <- splatEstimate(sce)  # Params object

### 3. 自定义参数,创建模拟数据
# 创建参数对象
params <- newSplatParams() #默认10000 Genes,100 Cells
# 查看参数对象中的参数设定
getParam(params, "nGenes")
getParam(params, "nCells")
getParams(params, c("nGenes", "mean.rate", "mean.shape"))
# 设定参数对象中的参数
params <- setParam(params, "nGenes", 5000)
params <- setParam(params, "nCells", 50)
params <- setParams(params, mean.shape = 0.5, de.prob = 0.2)
params <- setParams(params, update = list(nGenes = 8000, mean.rate = 0.5))

# 根据参数,创建模拟数据
sim <- splatSimulate(params)

### 4. 提取SingleCellExperiment类信息
class(sim)
# Information about genes
head(rowData(sim))
# Information about cells
head(colData(sim))
# Gene by cell matrices
names(assays(sim))
# Example of cell means matrix
assays(sim)$CellMeans[1:5, 1:5]
# 表达矩阵
counts <- counts(sim)
counts[1:3,1:5]

class(counts) # "matrix" "array" 
typeof(counts)  # [1] "integer"
dim(counts) #[1] 8000  100

### 5. 数据标准化,对数化和降维
## {scuttle}包的函数:logNormCounts
## {scater}包的函数:runPCA,plotPCA
#  SingleCellExperiment 对象的counts值计算对数转换的归一化表达式值
sim <- logNormCounts(sim)
counts(sim)[1:3,1:5]
logcounts(sim)[1:3,1:5]
# PCA降维
sim <- runPCA(sim)
plotPCA(sim)

### 6. 创建复杂的模拟数据
## groups
sim.groups <- splatSimulate(group.prob = c(0.3, 0.7), method = "groups",
                            verbose = FALSE)
sim.groups <- logNormCounts(sim.groups)
sim.groups <- runPCA(sim.groups)
plotPCA(sim.groups, colour_by = "Group")

dim(counts(sim.groups))
rowData(sim.groups)$DEFacGroup1
rowData(sim.groups)$DEFacGroup2
metadata(sim.groups)

## paths
sim.paths <- splatSimulate(de.prob = 0.2, nGenes = 1000, method = "paths",
                           verbose = FALSE)
sim.paths <- logNormCounts(sim.paths)
sim.paths <- runPCA(sim.paths)
plotPCA(sim.paths, colour_by = "Step")
colData(sim.paths)$Step

## batches
sim.batches <- splatSimulate(batchCells = c(50, 50), verbose = FALSE)
sim.batches <- logNormCounts(sim.batches)
sim.batches <- runPCA(sim.batches)
plotPCA(sim.batches, colour_by = "Batch")
rowData(sim.batches)$BatchFacBatch1
rowData(sim.batches)$BatchFacBatch2

### 7.比较SingleCellExperiment对象
## 相互比较
sim1 <- splatSimulate(nGenes = 1000, batchCells = 20, verbose = FALSE)
sim2 <- simpleSimulate(nGenes = 1000, nCells = 20, verbose = FALSE)
comparison <- compareSCEs(list(Splat = sim1, Simple = sim2))

names(comparison)
comparison$RowData
comparison$ColData

names(comparison$Plots)
comparison$Plots$Means
comparison$Plots$LibrarySizes
comparison$Plots$ZerosCell

## 与参考进行比较
sim1 <- splatSimulate(nGenes = 1000, batchCells = 100, verbose = FALSE)
sim2 <- splatSimulate(nGenes = 1000, batchCells = c(40, 60), verbose = FALSE)
sim3 <- simpleSimulate(nGenes = 1000, nCells = 100, verbose = FALSE)
difference <- diffSCEs(list(Splat1 = sim1, Splat2 = sim2, Simple = sim3), ref = "Simple")
difference$Plots$Means
difference$QQPlots$Means

### 8.批次效应参数设置
library("splatter")
library("scater")
library("ggplot2")

# Simulation with small batch effects
sim1 <- splatSimulate(params, batchCells = c(100, 100),
                      batch.facLoc = 0.001, batch.facScale = 0.001,
                      verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Batch") + ggtitle("Small batch effects")

# Simulation with big batch effects
sim2 <- splatSimulate(params, batchCells = c(100, 100),
                      batch.facLoc = 0.5, batch.facScale = 0.5,
                      verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Batch") + ggtitle("Big batch effects")

# 是否消除批次效应设置
sim1 <- splatSimulate(params, batchCells = c(100, 100), batch.rmEffect = FALSE,
verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Batch") + ggtitle("With batch effects")

sim2 <- splatSimulate(params, batchCells = c(100, 100), batch.rmEffect = TRUE,
                      verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Batch") + ggtitle("Batch effects removed")

### 9. 离群值参数设置
## ----outlier-prob-------------------------------------------------------------
# Few outliers
sim1 <- splatSimulate(out.prob = 0.001, verbose = FALSE)
ggplot(as.data.frame(rowData(sim1)),
       aes(x = log10(GeneMean), fill = OutlierFactor != 1)) +
  geom_histogram(bins = 100) +
  ggtitle("Few outliers")

# Lots of outliers
sim2 <- splatSimulate(out.prob = 0.2, verbose = FALSE)
ggplot(as.data.frame(rowData(sim2)),
       aes(x = log10(GeneMean), fill = OutlierFactor != 1)) +
  geom_histogram(bins = 100) +
  ggtitle("Lots of outliers")

### 10.设置组参数
# One small group, one big group
params.groups <- newSplatParams(batchCells = 500, nGenes = 1000)
sim1 <- splatSimulateGroups(params.groups, group.prob = c(0.9, 0.1),
                            verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Group") + ggtitle("One small group, one big group")

# Five groups
sim2 <- splatSimulateGroups(params.groups,
                            group.prob = c(0.2, 0.2, 0.2, 0.2, 0.2),
                            verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Group") + ggtitle("Five groups")

### 11. 设置差异表达基因参数
## de.prob 参数
# Few DE genes
params.groups <- newSplatParams(batchCells = 500, nGenes = 1000)
sim1 <- splatSimulateGroups(params.groups, group.prob = c(0.5, 0.5),
                            de.prob = 0.01, verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Group") + ggtitle("Few DE genes")

# Lots of DE genes
sim2 <- splatSimulateGroups(params.groups, group.prob = c(0.5, 0.5),
                            de.prob = 0.3, verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Group") + ggtitle("Lots of DE genes")

## de.facLoc参数
# Small DE factors
sim1 <- splatSimulateGroups(params.groups, group.prob = c(0.5, 0.5),
                            de.facLoc = 0.01, verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Group") + ggtitle("Small DE factors")

# Big DE factors
sim2 <- splatSimulateGroups(params.groups, group.prob = c(0.5, 0.5),
                            de.facLoc = 0.3, verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Group") + ggtitle("Big DE factors")

## complex-de---------------------------------------------------------------
# Different DE probs
sim1 <- splatSimulateGroups(params.groups,
                            group.prob = c(0.2, 0.2, 0.2, 0.2, 0.2),
                            de.prob = c(0.01, 0.01, 0.1, 0.1, 0.3),
                            verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Group") +
  labs(title = "Different DE probabilities",
       caption = paste("Groups 1 and 2 have very few DE genes,",
                       "Groups 3 and 4 have the default number,",
                       "Group 5 has many DE genes"))


# Different DE factors
sim2 <- splatSimulateGroups(params.groups,
                            group.prob = c(0.2, 0.2, 0.2, 0.2, 0.2),
                            de.facLoc = c(0.01, 0.01, 0.1, 0.1, 0.2),
                            de.facScale = c(0.2, 0.5, 0.2, 0.5, 0.4),
                            verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Group") +
  labs(title = "Different DE factors",
       caption = paste("Group 1 has factors with small location (value),",
                       "and scale (variability),",
                       "Group 2 has small location and greater scale.\n",
                       "Groups 3 and 4 have greater location with small,",
                       "and large scales",
                       "Group 5 has bigger factors with moderate",
                       "variability"))

# Combination of everything
sim3 <- splatSimulateGroups(params.groups,
                            group.prob = c(0.05, 0.2, 0.2, 0.2, 0.35),
                            de.prob = c(0.3, 0.1, 0.2, 0.01, 0.1),
                            de.downProb = c(0.1, 0.4, 0.9, 0.6, 0.5),
                            de.facLoc = c(0.6, 0.1, 0.1, 0.01, 0.2),
                            de.facScale = c(0.1, 0.4, 0.2, 0.5, 0.4),
                            verbose = FALSE)
sim3 <- logNormCounts(sim3)
sim3 <- runPCA(sim3)
plotPCA(sim3, colour_by = "Group") +
  labs(title = "Different DE factors",
       caption = paste(
         "Group 1 is small with many very up-regulated DE genes,",
         "Group 2 has the default DE parameters,\n",
         "Group 3 has many down-regulated DE genes,",
         "Group 4 has very few DE genes,",
         "Group 5 is large with moderate DE factors")
  )

### 12. path参数设置
# Linear paths
params.groups <- newSplatParams(batchCells = 500, nGenes = 1000)
sim1 <- splatSimulatePaths(params.groups,
                           group.prob = c(0.25, 0.25, 0.25, 0.25),
                           de.prob = 0.5, de.facLoc = 0.2,
                           path.from = c(0, 1, 2, 3),
                           verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Group") + ggtitle("Linear paths")

# Branching path
sim2 <- splatSimulatePaths(params.groups,
                           group.prob = c(0.25, 0.25, 0.25, 0.25),
                           de.prob = 0.5, de.facLoc = 0.2,
                           path.from = c(0, 1, 1, 3),
                           verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Group") + ggtitle("Branching path")

## ----paths-steps------------------------
# Few steps
sim1 <- splatSimulatePaths(params.groups, path.nSteps = 3,
                           de.prob = 0.5, de.facLoc = 0.2, verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Step") + ggtitle("Few steps")

# Lots of steps
sim2 <- splatSimulatePaths(params.groups, path.nSteps = 1000,
                           de.prob = 0.5, de.facLoc = 0.2, verbose = FALSE)
sim2 <- logNormCounts(sim2)
sim2 <- runPCA(sim2)
plotPCA(sim2, colour_by = "Step") + ggtitle("Lots of steps")

## ----paths-skew-------------------
# Skew towards the end
sim1 <- splatSimulatePaths(params.groups, path.skew = 0.1,
                           de.prob = 0.5, de.facLoc = 0.2, verbose = FALSE)
sim1 <- logNormCounts(sim1)
sim1 <- runPCA(sim1)
plotPCA(sim1, colour_by = "Step") + ggtitle("Skewed towards the end")

参考

https://www.bioconductor.org/packages/release/bioc/html/splatter.html

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5596896/

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值