今天复现单细胞蛋白组学数据分析流程,scp数据分析流程(R语言),欢迎大家交流学习!!!
原文章链接:https://currentprotocols.onlinelibrary.wiley.com/doi/10.1002/cpz1.658.
文章名称:
单细胞蛋白数据分析流程:
- 数据格式要求。
2.载入相关的R安装包
library("scp")
library("ggplot2")
library("dplyr")
library("impute")
3.读取scp数据
data("mqScpData")
data("sampleAnnotation")
table(sampleAnnotation$SampleType)
scp <- readSCP(assayData = mqScpData,
colData = sampleAnnotation,
runCol = "Raw.file",
removeEmptyCols = TRUE)
plot(scp)
4.数据前处理
####将0值全部替换为NA值,方便进行下游分析
scp <- zeroIsNA(scp, i = 1:4)
rowDataNames(scp)
####进行数据过滤,对肽段匹配图谱数据进行过滤,保留质量较好的数据
scp <- filterFeatures(scp,
~ Reverse != "+" &
Potential.contaminant != "+" &
!is.na(PIF) & PIF > 0.8)
keepAssay <- dims(scp)[1, ] > 150
scp <- scp[, , keepAssay]
table(colData(scp)[, "SampleType"])
scp <- computeSCR(scp,
i = 1:3,
colvar = "SampleType",
carrierPattern = "Carrier",
samplePattern = "Macrophage|Monocyte",
sampleFUN = "mean",
rowDataName = "MeanSCR")
#######查看数据#########
Data = rbindRowData(scp, i = 1:3) |> data.frame()
rbindRowData(scp, i = 1:3) |>
data.frame() |>
ggplot(aes(x = MeanSCR)) +
geom_histogram() +
geom_vline(xintercept = c(1/200, 0.1),
lty = c(2, 1)) +
scale_x_log10()
###########数据过滤######
scp <- filterFeatures(scp,
~ !is.na(MeanSCR1) &
MeanSCR < 0.1)
scp <- pep2qvalue(scp,
i = 1:3,
PEP = "dart_PEP",
rowDataName = "qvalue_PSMs")
scp <- pep2qvalue(scp,
i = 1:3,
PEP = "dart_PEP",
groupBy = "Leading.razor.protein",
rowDataName = "qvalue_proteins")
scp <- filterFeatures(scp,
~ qvalue_proteins < 0.01)
5.处理PSM数据
scp <- divideByReference(scp,
i = 1:3,
colvar = "SampleType",
samplePattern = ".",
refPattern = "Reference")
6.将PSM数据转换为肽段数据
scp <- aggregateFeaturesOverAssays(scp,
i = 1:3,
fcol = "Modified.sequence",
name = paste0("peptides_", names(scp)),
fun = matrixStats::colMedians, na.rm = TRUE)
plot(scp)
####加入SCoPE2 数据
scp <- joinAssays(scp,
i = 4:6,
name = "peptides")
plot(scp)
7.单细胞肽段数据过滤
#######过滤单细胞样本类型数据#####
scp <- scp[, scp$SampleType %in% c("Blank", "Macrophage", "Monocyte"), ]
#######基于中位数相对强度进行过滤#####
medians <- colMedians(assay(scp[["peptides"]]), na.rm = TRUE)
scp$MedianRI <- medians
colData(scp) |> data.frame() |> ggplot() +
aes(x = MedianRI,
y = SampleType,
fill = SampleType) +
geom_boxplot() +
scale_x_log10()
#########基于中位数CV进行过滤#####
scp <- medianCVperCell(scp,
i = 1:3,
groupBy = "Leading.razor.protein",
nobs = 5,
norm = "div.median",
na.rm = TRUE,
colDataName = "MedianCV")
getWithColData(scp, "peptides") |>
colData() |>
data.frame() |>
ggplot(aes(x = MedianCV,
fill = SampleType)) +
geom_boxplot() +
geom_vline(xintercept = 0.65)
scp <- scp[, !is.na(scp$MedianCV) & scp$MedianCV < 0.65, ]
scp <- scp[, scp$SampleType != "Blank", ]
8.单细胞肽段数据前处理
#########归一化
## Divide columns by median
scp <- sweep(scp,
i = "peptides",
MARGIN = 2,
FUN = "/",
STATS = colMedians(assay(scp[["peptides"]]), na.rm = TRUE),
name = "peptides_norm_col")
## Divide rows by mean
scp <- sweep(scp,
i = "peptides_norm_col",
MARGIN = 1,
FUN = "/",
STATS = rowMeans(assay(scp[["peptides_norm_col"]]), na.rm = TRUE),
name = "peptides_norm")
plot(scp)
###########移除缺失值超过0.99的#####
scp <- filterNA(scp,
i = "peptides_norm",
pNA = 0.99)
#########log转换#########
scp <- logTransform(scp,
base = 2,
i = "peptides_norm",
name = "peptides_log")
9.肽段数据转换为蛋白数据
scp <- aggregateFeatures(scp,
i = "peptides_log",
name = "proteins",
fcol = "Leading.razor.protein",
fun = matrixStats::colMedians, na.rm = TRUE)
10.单细胞蛋白数据前处理
## Center columns with median
scp <- sweep(scp, i = "proteins",
MARGIN = 2,
FUN = "-",
STATS = colMedians(assay(scp[["proteins"]]),
na.rm = TRUE),
name = "proteins_norm_col")
## Center rows with mean
scp <- sweep(scp, i = "proteins_norm_col",
MARGIN = 1,
FUN = "-",
STATS = rowMeans(assay(scp[["proteins_norm_col"]]),
na.rm = TRUE),
name = "proteins_norm")
scp[["proteins_norm"]] |>
assay() |>
is.na() |>
mean()
###### missing data impute
scp <- impute(scp,
i = "proteins_norm",
name = "proteins_imptd",
method = "knn",
k = 3, rowmax = 1, colmax= 1,
maxp = Inf, rng.seed = 1234)
scp[["proteins_imptd"]] |>
assay() |>
is.na() |>
mean()
######batch correaction
sce <- getWithColData(scp, "proteins_imptd")
batch <- sce$runCol
model <- model.matrix(~ SampleType, data = colData(sce))
library(sva)
assay(sce) <- ComBat(dat = assay(sce),
batch = batch,
mod = model)
scp <- addAssay(scp,
y = sce,
name = "proteins_batchC")
scp <- addAssayLinkOneToOne(scp,
from = "proteins_imptd",
to = "proteins_batchC")
plot(scp)
11.降维分析
########降维分析
library(scater)
scp[["proteins_batchC"]] <- runPCA(scp[["proteins_batchC"]],
ncomponents = 5,
ntop = Inf,
scale = TRUE,
exprs_values = 1,
name = "PCA")
plotReducedDim(scp[["proteins_batchC"]],
dimred = "PCA",
colour_by = "SampleType",
point_alpha = 1)
scp[["proteins_batchC"]] <- runUMAP(scp[["proteins_batchC"]],
ncomponents = 2,
ntop = Inf,
scale = TRUE,
exprs_values = 1,
n_neighbors = 3,
dimred = "PCA",
name = "UMAP")
plotReducedDim(scp[["proteins_batchC"]],
dimred = "UMAP",
colour_by = "SampleType",
point_alpha = 1)
12.监测数据处理结果
subsetByFeature(scp, "P13796") |>
## Format the `QFeatures` to a long format table
longFormat(colvars = c("runCol", "SampleType", "quantCols")) |>
data.frame() |>
## This is used to preserve ordering of the samples and assays in ggplot2
mutate(assay = factor(assay, levels = names(scp)),
Channel = sub("Reporter.intensity.", "TMT-", quantCols),
Channel = factor(Channel, levels = unique(Channel))) |>
## Start plotting
ggplot(aes(x = Channel, y = value, group = rowname, col = SampleType)) +
geom_point() +
## Plot every assay in a separate facet
facet_wrap(facets = vars(assay), scales = "free_y", ncol = 3) +
## Annotate plot
xlab("Channels") +
ylab("Intensity (arbitrary units)") +
## Improve plot aspect
theme(axis.text.x = element_text(angle = 90),
strip.text = element_text(hjust = 0),
legend.position = "bottom")
如有疑问,欢迎咨询:kriswcyYQ,欢迎关注微信公众号:质绘元。