scuttle包对单细胞数据质控

scuttle软件包为单细胞组学数据分析提供了一些实用功能。这包括一些计算和过滤质量控制的简单方法;基础数据转换涉及各种类型的缩放规范化;以及跨单元或功能的灵活聚合。

1. 载入SingleCellExperiment数据

# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("scuttle")

library(scRNAseq)
sce <- ZeiselBrainData()
sce

2. 增加PerCellQCMetrics数据到colData

library(scuttle)
is.mito <- grep("mt-", rownames(sce))

##方法1

# subsets:A named list containing one or more vectors (a character vector 
# of feature names, a logical vector, 
# or a numeric vector of indices), used to identify interesting feature subsets 
# such as ERCC spike-in transcripts or mitochondrial genes.
per.cell <- perCellQCMetrics(sce, subsets=list(Mito=is.mito))
colnames(per.cell)
per.cell[1:3,]

# 过滤
#discard <- perCellQCFilters(per.cell)
#filtered_sce <- sce[,!discard$discard]

# 总counts
summary(per.cell$sum)
# 检测到的基因数量
summary(per.cell$detected)
# sum(assay(sce)[,"1772071015_C02"]>0)

summary(per.cell$subsets_Mito_percent)
summary(per.cell$altexps_ERCC_percent)

colData(sce) <- cbind(colData(sce), per.cell)
colnames(colData(sce))

## 方法2
sce <- addPerCellQCMetrics(sce, subsets=list(Mito=is.mito))
colnames(colData(sce))

# sce <- addPerCellQCMetrics(sce)
# colnames(colData(sce))

3. 根据CellQC过滤

## 方法1
# An example with just mitochondrial filters.
qc.stats <- perCellQCFilters(per.cell, sub.fields="subsets_Mito_percent") 
colSums(as.matrix(qc.stats))

# Another example with mitochondrial + spike-in filters.
qc.stats2 <- perCellQCFilters(per.cell, 
                              sub.fields=c("subsets_Mito_percent", 
                                           "altexps_ERCC_percent")) 
colSums(as.matrix(qc.stats2))

# 过滤
filtered_sce2 <- sce[,!qc.stats2$discard]

# 方法2
filtered_sce3 <- quickPerCellQC(sce, subsets=list(Mito=is.mito), 
                               sub.fields=c("subsets_Mito_percent", 
                                            "altexps_ERCC_percent"))

4 . 过滤outliers

# Convenience function to determine which values in a numeric vector are 
# outliers based on the median absolute deviation (MAD).
# 是否有低counts数(outlier)的单细胞
low.total <- isOutlier(per.cell$sum, type="lower", log=TRUE)
summary(low.total)

attr(low.total, "threshold")

### 去除总counts数outlier的细胞
sce[,!low.total]

# 按tissue分别计算,log2 转化后有离群值。
# 按tissue分别计算,不进行log2 转化,没有离群值。
# 不按tissue分别计算,总体计算,log2 转化后没有离群值
low.total.batched <- isOutlier(per.cell$sum, type="lower", 
                               log=TRUE, batch=sce$tissue)
low.total.batched <- isOutlier(per.cell$sum, type="lower", 
                               batch=sce$tissue)
low.total.batched <- isOutlier(per.cell$sum, type="lower", 
                               log=TRUE)

summary(low.total.batched)

### 去除总counts数batch outlier的细胞
sce[,!low.total.batched]

5. 查看特征质量矩阵perFeatureQCMetrics

# Pretending that the first 10 cells are empty wells, for demonstration.
#  通过subset设定空wells,结果分别统计空/非空wells中的基因平均值等
per.feat <- perFeatureQCMetrics(sce, subsets=list(Empty=1:10))
summary(per.feat$mean)
summary(per.feat$detected)
summary(per.feat$subsets_Empty_ratio)

assay(sce)[,1:10]

## 每一个基因的在细胞中的平均count数
#Calculate the average count for each feature after normalizing observations 
#using per-cell size factors.
ave <- calculateAverage(sce)
summary(ave)

6. 归一化normalization
 

#####normalization

# example_sce <- mockSCE()
# sizeFactors(example_sce)
# example_sce <- computeLibraryFactors(example_sce)
# sizeFactors(example_sce) <- librarySizeFactors(example_sce)
assay(example_sce)
assay(altExp(example_sce))

library(scRNAseq)
sce2 <- ZeiselBrainData()
colnames(colData(sce2))

library(scuttle)
sce2 <- quickPerCellQC(sce2, subsets=list(Mito=grep("mt-", rownames(sce))),
                      sub.fields=c("subsets_Mito_percent", "altexps_ERCC_percent")) 
# 转化为dgCMatrix类型,快速运算
counts(sce) <- as(counts(sce), "dgCMatrix")

#Define per-cell size factors from the library sizes (i.e., total sum of counts per cell).
summary(librarySizeFactors(sce))
# Define per-cell size factors from the geometric mean of counts per cell.
#summary(geometricSizeFactors(sce))

## 对所有细胞归一化。添加sizeFactor到colData(sce)
sizeFactors(sce) <- librarySizeFactors(sce)
sce <- computeLibraryFactors(sce)
colnames(colData(sce))

## 跟据clusters分别计算sizeFactors
library(scran)
clusters <- quickCluster(sce)

# Scaling normalization of single-cell RNA-seq data by 
# deconvolving size factors from cell pools.
sce <- computePooledFactors(sce, clusters=clusters)
summary(sizeFactors(sce))

sce2 <- computeSpikeFactors(sce, "ERCC")
summary(sizeFactors(sce2))

# assay(altExp(sce2,"ERCC"))[1:3,1:5]

7. 表达谱数据转化

sce <- logNormCounts(sce)
assayNames(sce)
#counts(sce)[1:3,1:2]
#logcounts(sce)[1:3,1:2]

assay(sce, "cpm") <- calculateCPM(sce)

# calculateFPKM 需要features的lengths

8. 其他函数

## 按细胞特征分组统计
agg.sce <- aggregateAcrossCells(sce, ids=sce$level1class)
#applySCE(sce, aggregateAcrossCells,ids=sce$level1class)
head(assay(agg.sce))
colData(agg.sce)[,c("ids", "ncells")]

agg.sce <- aggregateAcrossCells(sce, 
                                ids=colData(sce)[,c("level1class", "tissue")])
head(assay(agg.sce))
colData(agg.sce)[,c("level1class", "tissue", "ncells")]


## 按基因特征分组统计
agg.feat <- sumCountsAcrossFeatures(sce,
                                    ids=list(GeneSet1=1:10, GeneSet2=11:50, GeneSet3=1:100),
                                    average=TRUE, exprs_values="logcounts")
agg.feat[,1:10]

# summarizeAssayByGroup
# aggregateAcrossFeatures

参考

http://www.bioconductor.org/packages/release/bioc/html/scuttle.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值