单细胞蛋白组学数据处理

王元启的生信记录

已于 2024-05-30 16:21:07 修改

阅读量108

点赞数 1

分类专栏：单细胞蛋白组学数据处理文章标签： r语言数据分析经验分享

于 2024-05-30 16:02:51 首次发布

单细胞蛋白组学数据处理专栏收录该内容

13 篇文章 0 订阅

订阅专栏

今天复现单细胞蛋白组学数据分析流程，scp数据分析流程(R语言），欢迎大家交流学习！！！
原文章链接：https://currentprotocols.onlinelibrary.wiley.com/doi/10.1002/cpz1.658.
文章名称：在这里插入图片描述
单细胞蛋白数据分析流程：

数据格式要求。

2.载入相关的R安装包

library("scp")
library("ggplot2")
library("dplyr")
library("impute")

3.读取scp数据

data("mqScpData")
data("sampleAnnotation")
table(sampleAnnotation$SampleType)
scp <- readSCP(assayData = mqScpData,
               colData = sampleAnnotation,
               runCol = "Raw.file",
               removeEmptyCols = TRUE)
plot(scp)

4.数据前处理

####将0值全部替换为NA值，方便进行下游分析
scp <- zeroIsNA(scp, i = 1:4)
rowDataNames(scp)
####进行数据过滤，对肽段匹配图谱数据进行过滤，保留质量较好的数据
scp <- filterFeatures(scp,
                      ~ Reverse != "+" &
                        Potential.contaminant != "+" &
                        !is.na(PIF) & PIF > 0.8)
keepAssay <- dims(scp)[1, ] > 150
scp <- scp[, , keepAssay]
table(colData(scp)[, "SampleType"])
scp <- computeSCR(scp,
                  i = 1:3,
                  colvar = "SampleType",
                  carrierPattern = "Carrier",
                  samplePattern = "Macrophage|Monocyte",
                  sampleFUN = "mean",
                  rowDataName = "MeanSCR")
#######查看数据#########
Data = rbindRowData(scp, i = 1:3) |> data.frame()

rbindRowData(scp, i = 1:3) |>
  data.frame() |>
  ggplot(aes(x = MeanSCR)) +
  geom_histogram() +
  geom_vline(xintercept = c(1/200, 0.1),
             lty = c(2, 1)) +
  scale_x_log10()
###########数据过滤######
scp <- filterFeatures(scp,
                      ~ !is.na(MeanSCR1) &
                        MeanSCR < 0.1)
scp <- pep2qvalue(scp,
                  i = 1:3,
                  PEP = "dart_PEP",
                  rowDataName = "qvalue_PSMs")
scp <- pep2qvalue(scp,
                  i = 1:3,
                  PEP = "dart_PEP",
                  groupBy = "Leading.razor.protein",
                  rowDataName = "qvalue_proteins")
scp <- filterFeatures(scp,
                      ~ qvalue_proteins < 0.01)

5.处理PSM数据

scp <- divideByReference(scp,
                         i = 1:3,
                         colvar = "SampleType",
                         samplePattern = ".",
                         refPattern = "Reference")

6.将PSM数据转换为肽段数据

scp <- aggregateFeaturesOverAssays(scp,
                                   i = 1:3,
                                   fcol = "Modified.sequence",
                                   name = paste0("peptides_", names(scp)),
                                   fun = matrixStats::colMedians, na.rm = TRUE)
plot(scp)
####加入SCoPE2 数据
scp <- joinAssays(scp,
                  i = 4:6,
                  name = "peptides")
plot(scp)

7.单细胞肽段数据过滤

#######过滤单细胞样本类型数据#####
scp <- scp[, scp$SampleType %in% c("Blank", "Macrophage", "Monocyte"), ]

#######基于中位数相对强度进行过滤#####

medians <- colMedians(assay(scp[["peptides"]]), na.rm = TRUE)
scp$MedianRI <- medians
colData(scp) |> data.frame() |> ggplot() +
  aes(x = MedianRI,
      y = SampleType,
      fill = SampleType) +
  geom_boxplot() +
  scale_x_log10()
#########基于中位数CV进行过滤#####
scp <- medianCVperCell(scp,
                       i = 1:3,
                       groupBy = "Leading.razor.protein",
                       nobs = 5,
                       norm = "div.median",
                       na.rm = TRUE,
                       colDataName = "MedianCV")
getWithColData(scp, "peptides") |>
  colData() |>
  data.frame() |>
  ggplot(aes(x = MedianCV,
             fill = SampleType)) +
  geom_boxplot() +
  geom_vline(xintercept = 0.65)
scp <- scp[, !is.na(scp$MedianCV) & scp$MedianCV < 0.65, ]
scp <- scp[, scp$SampleType != "Blank", ]

8.单细胞肽段数据前处理

#########归一化
## Divide columns by median
scp <- sweep(scp,
             i = "peptides",
             MARGIN = 2,
             FUN = "/",
             STATS = colMedians(assay(scp[["peptides"]]), na.rm = TRUE),
             name = "peptides_norm_col")
## Divide rows by mean
scp <- sweep(scp,
             i = "peptides_norm_col",
             MARGIN = 1,
             FUN = "/",
             STATS = rowMeans(assay(scp[["peptides_norm_col"]]),  na.rm = TRUE),
             name = "peptides_norm")
plot(scp)
###########移除缺失值超过0.99的#####
scp <- filterNA(scp,
                i = "peptides_norm",
                pNA = 0.99)
#########log转换#########
scp <- logTransform(scp,
                    base = 2,
                    i = "peptides_norm",
                    name = "peptides_log")

9.肽段数据转换为蛋白数据

scp <- aggregateFeatures(scp,
                         i = "peptides_log",
                         name = "proteins",
                         fcol = "Leading.razor.protein",
                         fun = matrixStats::colMedians, na.rm = TRUE)

10.单细胞蛋白数据前处理

## Center columns with median
scp <- sweep(scp, i = "proteins",
             MARGIN = 2,
             FUN = "-",
             STATS = colMedians(assay(scp[["proteins"]]),
                                na.rm = TRUE),
             name = "proteins_norm_col")
## Center rows with mean
scp <- sweep(scp, i = "proteins_norm_col",
             MARGIN = 1,
             FUN = "-",
             STATS = rowMeans(assay(scp[["proteins_norm_col"]]),
                              na.rm = TRUE),
             name = "proteins_norm")

scp[["proteins_norm"]] |>
  assay() |>
  is.na() |>
  mean()
###### missing data impute
scp <- impute(scp,
              i = "proteins_norm",
              name = "proteins_imptd",
              method = "knn",
              k = 3, rowmax = 1, colmax= 1,
              maxp = Inf, rng.seed = 1234)

scp[["proteins_imptd"]] |>
  assay() |>
  is.na() |>
  mean()

######batch correaction
sce <- getWithColData(scp, "proteins_imptd")

batch <- sce$runCol
model <- model.matrix(~ SampleType, data = colData(sce))
library(sva)
assay(sce) <- ComBat(dat = assay(sce),
                     batch = batch,
                     mod = model)
scp <- addAssay(scp,
                y = sce,
                name = "proteins_batchC")
scp <- addAssayLinkOneToOne(scp,
                            from = "proteins_imptd",
                            to = "proteins_batchC")

plot(scp)

11.降维分析

########降维分析
library(scater)
scp[["proteins_batchC"]] <- runPCA(scp[["proteins_batchC"]],
                                   ncomponents = 5,
                                   ntop = Inf,
                                   scale = TRUE,
                                   exprs_values = 1,
                                   name = "PCA")
plotReducedDim(scp[["proteins_batchC"]],
               dimred = "PCA",
               colour_by = "SampleType",
               point_alpha = 1)

scp[["proteins_batchC"]] <- runUMAP(scp[["proteins_batchC"]],
                                    ncomponents = 2,
                                    ntop = Inf,
                                    scale = TRUE,
                                    exprs_values = 1,
                                    n_neighbors = 3,
                                    dimred = "PCA",
                                    name = "UMAP")

plotReducedDim(scp[["proteins_batchC"]],
               dimred = "UMAP",
               colour_by = "SampleType",
               point_alpha = 1)

12.监测数据处理结果

subsetByFeature(scp, "P13796") |>
  ## Format the `QFeatures` to a long format table
  longFormat(colvars = c("runCol", "SampleType", "quantCols")) |>
  data.frame() |>
  ## This is used to preserve ordering of the samples and assays in ggplot2
  mutate(assay = factor(assay, levels = names(scp)),
         Channel = sub("Reporter.intensity.", "TMT-", quantCols),
         Channel = factor(Channel, levels = unique(Channel))) |>
  ## Start plotting
  ggplot(aes(x = Channel, y = value, group = rowname, col = SampleType)) +
  geom_point() +
  ## Plot every assay in a separate facet
  facet_wrap(facets = vars(assay), scales = "free_y", ncol = 3) +
  ## Annotate plot
  xlab("Channels") +
  ylab("Intensity (arbitrary units)") +
  ## Improve plot aspect
  theme(axis.text.x = element_text(angle = 90),
        strip.text = element_text(hjust = 0),
        legend.position = "bottom")