scuttle软件包为单细胞组学数据分析提供了一些实用功能。这包括一些计算和过滤质量控制的简单方法;基础数据转换涉及各种类型的缩放规范化;以及跨单元或功能的灵活聚合。
1. 载入SingleCellExperiment数据
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("scuttle")
library(scRNAseq)
sce <- ZeiselBrainData()
sce
2. 增加PerCellQCMetrics数据到colData
library(scuttle)
is.mito <- grep("mt-", rownames(sce))
##方法1
# subsets:A named list containing one or more vectors (a character vector
# of feature names, a logical vector,
# or a numeric vector of indices), used to identify interesting feature subsets
# such as ERCC spike-in transcripts or mitochondrial genes.
per.cell <- perCellQCMetrics(sce, subsets=list(Mito=is.mito))
colnames(per.cell)
per.cell[1:3,]
# 过滤
#discard <- perCellQCFilters(per.cell)
#filtered_sce <- sce[,!discard$discard]
# 总counts
summary(per.cell$sum)
# 检测到的基因数量
summary(per.cell$detected)
# sum(assay(sce)[,"1772071015_C02"]>0)
summary(per.cell$subsets_Mito_percent)
summary(per.cell$altexps_ERCC_percent)
colData(sce) <- cbind(colData(sce), per.cell)
colnames(colData(sce))
## 方法2
sce <- addPerCellQCMetrics(sce, subsets=list(Mito=is.mito))
colnames(colData(sce))
# sce <- addPerCellQCMetrics(sce)
# colnames(colData(sce))
3. 根据CellQC过滤
## 方法1
# An example with just mitochondrial filters.
qc.stats <- perCellQCFilters(per.cell, sub.fields="subsets_Mito_percent")
colSums(as.matrix(qc.stats))
# Another example with mitochondrial + spike-in filters.
qc.stats2 <- perCellQCFilters(per.cell,
sub.fields=c("subsets_Mito_percent",
"altexps_ERCC_percent"))
colSums(as.matrix(qc.stats2))
# 过滤
filtered_sce2 <- sce[,!qc.stats2$discard]
# 方法2
filtered_sce3 <- quickPerCellQC(sce, subsets=list(Mito=is.mito),
sub.fields=c("subsets_Mito_percent",
"altexps_ERCC_percent"))
4 . 过滤outliers
# Convenience function to determine which values in a numeric vector are
# outliers based on the median absolute deviation (MAD).
# 是否有低counts数(outlier)的单细胞
low.total <- isOutlier(per.cell$sum, type="lower", log=TRUE)
summary(low.total)
attr(low.total, "threshold")
### 去除总counts数outlier的细胞
sce[,!low.total]
# 按tissue分别计算,log2 转化后有离群值。
# 按tissue分别计算,不进行log2 转化,没有离群值。
# 不按tissue分别计算,总体计算,log2 转化后没有离群值
low.total.batched <- isOutlier(per.cell$sum, type="lower",
log=TRUE, batch=sce$tissue)
low.total.batched <- isOutlier(per.cell$sum, type="lower",
batch=sce$tissue)
low.total.batched <- isOutlier(per.cell$sum, type="lower",
log=TRUE)
summary(low.total.batched)
### 去除总counts数batch outlier的细胞
sce[,!low.total.batched]
5. 查看特征质量矩阵perFeatureQCMetrics
# Pretending that the first 10 cells are empty wells, for demonstration.
# 通过subset设定空wells,结果分别统计空/非空wells中的基因平均值等
per.feat <- perFeatureQCMetrics(sce, subsets=list(Empty=1:10))
summary(per.feat$mean)
summary(per.feat$detected)
summary(per.feat$subsets_Empty_ratio)
assay(sce)[,1:10]
## 每一个基因的在细胞中的平均count数
#Calculate the average count for each feature after normalizing observations
#using per-cell size factors.
ave <- calculateAverage(sce)
summary(ave)
6. 归一化normalization
#####normalization
# example_sce <- mockSCE()
# sizeFactors(example_sce)
# example_sce <- computeLibraryFactors(example_sce)
# sizeFactors(example_sce) <- librarySizeFactors(example_sce)
assay(example_sce)
assay(altExp(example_sce))
library(scRNAseq)
sce2 <- ZeiselBrainData()
colnames(colData(sce2))
library(scuttle)
sce2 <- quickPerCellQC(sce2, subsets=list(Mito=grep("mt-", rownames(sce))),
sub.fields=c("subsets_Mito_percent", "altexps_ERCC_percent"))
# 转化为dgCMatrix类型,快速运算
counts(sce) <- as(counts(sce), "dgCMatrix")
#Define per-cell size factors from the library sizes (i.e., total sum of counts per cell).
summary(librarySizeFactors(sce))
# Define per-cell size factors from the geometric mean of counts per cell.
#summary(geometricSizeFactors(sce))
## 对所有细胞归一化。添加sizeFactor到colData(sce)
sizeFactors(sce) <- librarySizeFactors(sce)
sce <- computeLibraryFactors(sce)
colnames(colData(sce))
## 跟据clusters分别计算sizeFactors
library(scran)
clusters <- quickCluster(sce)
# Scaling normalization of single-cell RNA-seq data by
# deconvolving size factors from cell pools.
sce <- computePooledFactors(sce, clusters=clusters)
summary(sizeFactors(sce))
sce2 <- computeSpikeFactors(sce, "ERCC")
summary(sizeFactors(sce2))
# assay(altExp(sce2,"ERCC"))[1:3,1:5]
7. 表达谱数据转化
sce <- logNormCounts(sce)
assayNames(sce)
#counts(sce)[1:3,1:2]
#logcounts(sce)[1:3,1:2]
assay(sce, "cpm") <- calculateCPM(sce)
# calculateFPKM 需要features的lengths
8. 其他函数
## 按细胞特征分组统计
agg.sce <- aggregateAcrossCells(sce, ids=sce$level1class)
#applySCE(sce, aggregateAcrossCells,ids=sce$level1class)
head(assay(agg.sce))
colData(agg.sce)[,c("ids", "ncells")]
agg.sce <- aggregateAcrossCells(sce,
ids=colData(sce)[,c("level1class", "tissue")])
head(assay(agg.sce))
colData(agg.sce)[,c("level1class", "tissue", "ncells")]
## 按基因特征分组统计
agg.feat <- sumCountsAcrossFeatures(sce,
ids=list(GeneSet1=1:10, GeneSet2=11:50, GeneSet3=1:100),
average=TRUE, exprs_values="logcounts")
agg.feat[,1:10]
# summarizeAssayByGroup
# aggregateAcrossFeatures
参考
http://www.bioconductor.org/packages/release/bioc/html/scuttle.html

1924

被折叠的 条评论
为什么被折叠?



