monocle3包分析单细胞转录组数据

最新推荐文章于 2025-04-20 00:25:25 发布

qq_27390023

最新推荐文章于 2025-04-20 00:25:25 发布

阅读量6.7k

点赞数 6

文章标签： r语言生物信息学

本文链接：https://blog.csdn.net/qq_27390023/article/details/121784718

版权

本文详细介绍了单细胞RNA测序数据分析的全过程，包括数据预处理、降维、聚类、细胞类型注释和差异表达基因分析。使用monocle3库进行操作，涵盖new_cell_data_set对象构建、批次效应校正、降维方法如UMAP和tSNE、聚类算法如leiden和louvain，以及单细胞轨迹推断。同时，讲解了如何寻找和分析细胞亚群的marker基因，以及利用Garnett进行自动化细胞类型注释。此外，还展示了沿着伪时间轨迹的基因表达分析和3D轨迹图的构建。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1. 构建new_cell_data_set对象

Usage

new_cell_data_set(expression_data, cell_metadata = NULL, gene_metadata = NULL)

Arguments

`expression_data`	expression data matrix for an experiment, can be a sparseMatrix.
`cell_metadata`	data frame containing attributes of individual cells, where `row.names(cell_metadata) = colnames(expression_data)`.
`gene_metadata`	data frame containing attributes of features (e.g. genes), where `row.names(gene_metadata) = row.names(expression_data)`.

# 设置工作目录
setwd("your/working/path")

# 载入包
library(monocle3)
# packageVersion('monocle3') # monocle版本
# ls("package:monocle3")
library(ggplot2)
library(dplyr)

# 1.分别读入表达矩阵，细胞注释数据框以及基因注释数据框，并构建new_cell_data_set对象
# Load the data
expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_expression.rds"))
cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_colData.rds"))
gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_rowData.rds"))

# Make the CDS object
cds <- new_cell_data_set(expression_matrix,
                         cell_metadata = cell_metadata,
                         gene_metadata = gene_annotation)

# 如果表达矩阵不是"dgCMatrix"，要转为sparseMatrix，加快下游的数据运算。
# cds <- new_cell_data_set(as(umi_matrix, "sparseMatrix"),
#                          cell_metadata = cell_metadata,
#                          gene_metadata = gene_metadata)


## 2. 从CellRanger的输出构建new_cell_data_set对象
## Provide the path to the Cell Ranger output.
#cds <- load_cellranger_data("~/Downloads/10x_data")
#cds <- load_mm_data(mat_path = "~/Downloads/matrix.mtx", 
#                    feature_anno_path = "~/Downloads/features.tsv", 
#                    cell_anno_path = "~/Downloads/barcodes.tsv")


## 3. 合并new_cell_data_set对象
## combine cds objects
# make a fake second cds object for demonstration
#cds2 <- cds[1:100,]
#big_cds <- combine_cds(list(cds, cds2))

2. 数据预处理，降维

可以先降维看是否有批次效应，消除批次效应后要重新进行降维运算。

Usage

plot_cells(
  cds,
  x = 1,
  y = 2,
  reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"),
  color_cells_by = "cluster",
  group_cells_by = c("cluster", "partition"),
  genes = NULL,
  show_trajectory_graph = TRUE,
  trajectory_graph_color = "grey28",
  trajectory_graph_segment_size = 0.75,
  norm_method = c("log", "size_only"),
  label_cell_groups = TRUE,
  label_groups_by_cluster = TRUE,
  group_label_size = 2,
  labels_per_group = 1,
  label_branch_points = TRUE,
  label_roots = TRUE,
  label_leaves = TRUE,
  graph_label_size = 2,
  cell_size = 0.35,
  cell_stroke = I(cell_size/2),
  alpha = 1,
  min_expr = 0.1,
  rasterize = FALSE,
  scale_to_range = FALSE,
  label_principal_points = FALSE
)

# 表型数据
pData(cds)
# 表达矩阵
exprs(cds)

# preprocess_cds首先通过log和size factor (或仅仅size factor )对数据进行标准化，以解决深度差# 异。接下来，preprocess_cds计算一个低维空间，该空间将用作进一步降维的输入，如tSNE和UMAP。
cds <- preprocess_cds(cds, num_dim = 100)

# whether the PCs capture most of the variation 
# in gene expression across all the cells in the data set?
plot_pc_variance_explained(cds)

# reduce_dimension,用pca（n=100）数据降维

# 单细胞数量 >10,000, 设置umap.fast_sgd=TRUE，加快计算速度
cds <- reduce_dimension(cds)
#cds <- reduce_dimension(cds,reduction_method="tSNE")

#获得降纬度后的数据矩阵
reducedDim(cds, "PCA")
reducedDim(cds, "UMAP")

# 根据UMAP作图
plot_cells(cds)
#plot_cells(cds,reduction_method="tSNE")

#check for batch effects
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)

# 去除批次效应
cds = align_cds(cds, num_dim = 100, alignment_group = "plate")
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)

# 根据表型加上颜色
#colnames(pData(cds))
plot_cells(cds, color_cells_by="cao_cell_type")
## 不加label
#plot_cells(cds, color_cells_by="cao_cell_type",label_cell_groups=FALSE)

#plot_cells(cds, reduction_method="tSNE",color_cells_by="cao_cell_type") 

# 显示各细胞中特定基因的表达量
plot_cells(cds, genes=c("cpna-2", "egl-21", "ram-2", "inos-1"))

3. 聚类分析

cluster_cells(
  cds,
  reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"),
  k = 20,
  cluster_method = c("leiden", "louvain"),
  num_iter = 2,
  partition_qval = 0.05,
  weight = FALSE,
  resolution = NULL,
  random_seed = NULL,
  verbose = F,
  ...
)

### 聚类
cds = cluster_cells(cds, resolution=1e-5)
# 根据cluster标记颜色
plot_cells(cds)
# cds = cluster_cells(cds, reduction_method="tSNE",resolution=1e-5)
# plot_cells(cds,reduction_method="tSNE")

# 根据partition标记颜色
plot_cells(cds, color_cells_by="partition", group_cells_by="partition")

# plot_cells(cds, color_cells_by="cao_cell_type")

### Find marker genes expressed by each cluster
marker_test_res <- top_markers(cds, group_cells_by="partition", 
                               reference_cells=1000, cores=8)
# colnames(marker_test_res)
top_specific_markers <- marker_test_res %>%
  filter(fraction_expressing >= 0.10) %>%
  group_by(cell_group) %>%
  top_n(1, pseudo_R2)

top_specific_marker_ids <- unique(top_specific_markers %>% pull(gene_id))
# plot the expression and fraction of cells that express each marker in each group 
# 比较不同partition/cluster中marker基因的平均表达量
plot_genes_by_group(cds,
                    top_specific_marker_ids,
                    group_cells_by="partition",
                    ordering_type="maximal_on_diag",
                    max.size=3)

4. 细胞类型注释

细胞类型的注释是研究中最难的部分，可能含有新发现的细胞亚群，需要专家知识。细胞亚群往往需要进一步聚类，注释，最后合并一张图上（包含所有的细胞群及精细的注释信息）。

### 1. 手工注释（根据marker基因的表达...）。本例中仅仅把colData中的细胞类型注释到不同的cluster
colData(cds)$assigned_cell_type <- as.character(partitions(cds))
# rename细胞类型
colData(cds)$assigned_cell_type = dplyr::recode(colData(cds)$assigned_cell_type,
                                                "1"="Germline",
                                                "2"="Body wall muscle",
                                                "3"="Unclassified neurons",
                                                "4"="Vulval precursors",
                                                "5"="Failed QC",
                                                "6"="Seam cells",
                                                "7"="Pharyngeal epithelia",
                                                "8"="Coelomocytes",
                                                "9"="Am/PH sheath cells",
                                                "10"="Failed QC",
                                                "11"="Touch receptor neurons",
                                                "12"="Intestinal/rectal muscle",
                                                "13"="Pharyngeal neurons",
                                                "14"="NA",
                                                "15"="flp-1(+) interneurons",
                                                "16"="Canal associated neurons",
                                                "17"="Ciliated sensory neurons",
                                                "18"="Other interneurons",
                                                "19"="Pharyngeal gland",
                                                "20"="Failed QC",
                                                "21"="Ciliated sensory neurons",
                                                "22"="Oxygen sensory neurons",
                                                "23"="Ciliated sensory neurons",
                                                "24"="Ciliated sensory neurons",
                                                "25"="Ciliated sensory neurons",
                                                "26"="Ciliated sensory neurons",
                                                "27"="Oxygen sensory neurons",
                                                "28"="Ciliated sensory neurons",
                                                "29"="Unclassified neurons",
                                                "30"="Socket cells",
                                                "31"="Failed QC",
                                                "32"="Pharyngeal gland",
                                                "33"="Ciliated sensory neurons",
                                                "34"="Ciliated sensory neurons",
                                                "35"="Ciliated sensory neurons",
                                                "36"="Failed QC",
                                                "37"="Ciliated sensory neurons",
                                                "38"="Pharyngeal muscle")

plot_cells(cds, group_cells_by="partition", color_cells_by="assigned_cell_type")

# 交互选择子细胞群
cds_subset <- choose_cells(cds)

## 对子细胞群进行聚类
cds_subset = cluster_cells(cds_subset, resolution=1e-2)
plot_cells(cds_subset, color_cells_by="cluster")
plot_cells(cds, color_cells_by="cluster")

# 注释子细胞群中的细胞类型
colData(cds_subset)$assigned_cell_type <- as.character(clusters(cds_subset))
colData(cds_subset)$assigned_cell_type <- dplyr::recode(colData(cds_subset)$assigned_cell_type,
                                                        "1"="Somatic gonad precursors",
                                                        "2"="Somatic gonad precursors",
                                                        "3"="Vulval precursors",
                                                        "4"="Sex myoblasts",
                                                        "5"="Sex myoblasts",
                                                        "6"="Vulval precursors",
                                                        "7"="Failed QC",
                                                        "8"="Vulval precursors",
                                                        "10"="Unclassified neurons",
                                                        "11"="Distal tip cells")
plot_cells(cds_subset, group_cells_by="cluster", 
           color_cells_by="assigned_cell_type")

# 在所有细胞群图中显示子细胞群的注释信息
colData(cds)[colnames(cds_subset),]$assigned_cell_type <- colData(cds_subset)$assigned_cell_type
cds <- cds[,colData(cds)$assigned_cell_type != "Failed QC" | is.na(colData(cds)$assigned_cell_type )]
plot_cells(cds, group_cells_by="partition", 
           color_cells_by="assigned_cell_type", 
           labels_per_group=5)

### 2. Automated annotation with Garnett
assigned_type_marker_test_res <- top_markers(cds,
                                             group_cells_by="assigned_cell_type",
                                             reference_cells=1000,
                                             cores=8)
# Require that markers have at least JS specificty score > 0.5 and
# be significant in the logistic test for identifying their cell type:
garnett_markers <- assigned_type_marker_test_res %>%
  filter(marker_test_q_value < 0.01 & specificity >= 0.5) %>%
  group_by(cell_group) %>%
  top_n(5, marker_score)
# Exclude genes that are good markers for more than one cell type:
garnett_markers <- garnett_markers %>% 
  group_by(gene_short_name) %>%
  filter(n() == 1)

# 生成用于细胞类型注释的文件，可以根据专业知识修改用于细胞类型识别的基因
generate_garnett_marker_file(garnett_markers, file="./marker_file.txt")
colData(cds)$garnett_cluster <- clusters(cds)

# 安装相应的包
BiocManager::install(c("org.Mm.eg.db", "org.Hs.eg.db")) # garnett依赖的包
devtools::install_github("cole-trapnell-lab/garnett", ref="monocle3")
BiocManager::install("org.Ce.eg.db")
library(garnett)

# 训练分类器，训练好的分类器可以保存起来用于其他相似数据的注释
worm_classifier <- train_cell_classifier(cds = cds,
                                         marker_file = "./marker_file.txt", 
                                         db=org.Ce.eg.db::org.Ce.eg.db,
                                         cds_gene_id_type = "ENSEMBL",
                                         num_unknown = 50,
                                         marker_file_gene_id_type = "SYMBOL",
                                         cores=8)

#saveRDS(worm_classifier,"worm_classifier.RDS")
#worm_classifier <- readRDS("worm_classifier.RDS")

# 根据分类器进行细胞类型注释
cds <- classify_cells(cds, worm_classifier,
                      db = org.Ce.eg.db::org.Ce.eg.db,
                      cluster_extend = TRUE,
                      cds_gene_id_type = "ENSEMBL")
plot_cells(cds,
           group_cells_by="partition",
           color_cells_by="cluster_ext_type")

5.差异表达基因分析

monocle3的差异分析有两种方法：
回归分析：使用fit_models（），您可以评估每个基因是否取决于时间、治疗等变量。
图形自相关分析：使用Graph_test（），您可以找到在轨迹上或簇之间变化的基因。

# 演示数据，取子集，对部分基因做差异表达分析
ciliated_genes <- c("che-1",
                    "hlh-17",
                    "nhr-6",
                    "dmd-6",
                    "ceh-36",
                    "ham-1")
cds_subset <- cds[rowData(cds)$gene_short_name %in% ciliated_genes,]


### 1.回归分析差异表达基因
gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time")
# fit_models:该函数适用于细胞数据集中每个基因的广义线性模型。
#  可以提供公式来解释额外的协变量 ~embryo.time + batch
# （例如收集的天数、细胞基因型、培养基条件等）。

# 主要参数1: model_formula_str 可以是~cluster or ~partition等colData列名
# 主要参数2:expression_family:Specifies the family function used for expression responses. 
#Can be one of "quasipoisson", "negbinomial", "poisson", "binomial", "gaussian",
#"zipoisson", or "zinegbinomial". Default is "quasipoisson".

class(gene_fits) # "tbl_df"     "tbl"        "data.frame" 

fit_coefs <- coefficient_table(gene_fits)
emb_time_terms <- fit_coefs %>% filter(term == "embryo.time")

emb_time_terms %>% filter (q_value < 0.05) %>%
  select(gene_short_name, term, q_value, estimate)

plot_genes_violin(cds_subset, group_cells_by="embryo.time.bin", ncol=2) +
  theme(axis.text.x=element_text(angle=45, hjust=1))

# control batch effect
gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time + batch")
# gene_fits2 <- fit_models(cds_subset, model_formula_str = "~cluster+ batch")
fit_coefs <- coefficient_table(gene_fits)
fit_coefs %>% filter(term != "(Intercept)") %>%
  select(gene_short_name, term, q_value, estimate)

evaluate_fits(gene_fits)

# 模型比较，是否存在批次效应
time_batch_models <- fit_models(cds_subset,
                                model_formula_str = "~embryo.time + batch",
                                expression_family="negbinomial")
time_models <- fit_models(cds_subset,
                          model_formula_str = "~embryo.time",
                          expression_family="negbinomial")
compare_models(time_batch_models, time_models) %>% select(gene_short_name, q_value)

# 比较结果：所有基因的似然比检验都是显著的，这表明数据中存在大量的批量效应。
# 因此，我们有理由将批处理项添加到我们的模型中。

### 2.图形自相关分析差异表达基因
# 依据子细胞群在低维空间的位置，运用Moran’s I 检验，检测空间相关的差异表达基因
pr_graph_test_res <- graph_test(cds_subset, neighbor_graph="knn", cores=8)
pr_deg_ids <- row.names(subset(pr_graph_test_res, 
                               morans_I > 0.01 & q_value < 0.05))
#将基因聚集成跨细胞共表达的模块
gene_module_df <- find_gene_modules(cds_subset[pr_deg_ids,], resolution=1e-3)
plot_cells(cds_subset, genes=gene_module_df, 
           show_trajectory_graph=FALSE, 
           label_cell_groups=FALSE)

6. 单细胞伪时间轨迹分析

####### 单细胞伪时间轨迹分析#######
###1.构建cell_data_set对象
expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_expression.rds"))
cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_colData.rds"))
gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_rowData.rds"))

cds <- new_cell_data_set(expression_matrix,
                         cell_metadata = cell_metadata,
                         gene_metadata = gene_annotation)

### 2.预处理
#normalization and dimension reduction.

cds <- preprocess_cds(cds, num_dim = 50)
# plot_pc_variance_explained(cds) # 查看pca的分量解释多少变异

# remove batch effects
head(colData(cds))
cds <- align_cds(cds, alignment_group = "batch", 
                 residual_model_formula_str = "~ bg.300.loading + bg.400.loading + bg.500.1.loading + bg.500.2.loading + bg.r17.loading + bg.b01.loading + bg.b02.loading")

# alignment_group:指定用于对齐单元格组的colData列的字符串。
# 指定的列必须是一个因子。对齐可用于以非线性方式减去批次效果。
# 要更正连续效果，请使用残差模型公式。默认值为空。

# residual_model_formula_str:NULL或字符串模型公式，
# 指定在降维之前从数据中减去的任何影响。使用线性模型减去效果。
# 对于非线性效果，请使用对齐组。默认值为空。    

# 使用pca处理的数据进一步降维以便聚类和轨迹推断
cds <- reduce_dimension(cds)
plot_cells(cds, label_groups_by_cluster=FALSE,  color_cells_by = "cell.type")

# 图中显示基因表达情况
ciliated_genes <- c("che-1",
                    "hlh-17",
                    "nhr-6",
                    "dmd-6",
                    "ceh-36",
                    "ham-1")

plot_cells(cds,
           genes=ciliated_genes,
           label_cell_groups=FALSE,
           show_trajectory_graph=FALSE)

### 3.单细胞聚类
cds <- cluster_cells(cds)
plot_cells(cds, color_cells_by = "partition")
# partition和cluster有区别
# plot_cells(cds, color_cells_by = "cluster")

### 4. 学习伪时间轨迹图
# 伪时间是衡量单个细胞在细胞分化等过程中所取得进展的指标。
cds <- learn_graph(cds)
plot_cells(cds,
           color_cells_by = "cell.type",
           label_groups_by_cluster=FALSE,
           label_leaves=FALSE,
           label_branch_points=FALSE)

# 在轨迹图标记胚胎时间段，
# 黑线表示图形的结构，黑圈表示分支节点，浅灰色圆圈表示细胞状态
plot_cells(cds,
           color_cells_by = "embryo.time.bin",
           label_cell_groups=FALSE,
           label_leaves=TRUE,
           label_branch_points=TRUE,
           graph_label_size=1.5)
# 交互选择根节点（最初状态的细胞）
cds <- order_cells(cds)

# 伪时间轨迹图作图，灰色表示无限的伪时间，因为从拾取的根节点无法访问它们。
# 一般一个partition选择一个root，就能解决无限伪时间问题
plot_cells(cds,
           color_cells_by = "pseudotime",
           label_cell_groups=FALSE,
           label_leaves=FALSE,
           label_branch_points=FALSE,
           graph_label_size=1.5)

# # 不显示轨迹图
# plot_cells(cds,
#            color_cells_by = "pseudotime",
#            label_cell_groups=FALSE,
#            label_leaves=FALSE,
#            label_branch_points=FALSE,
#            show_trajectory_graph = FALSE)
      
## 自动选择根节点（root）
# a helper function to identify the root principal points:
get_earliest_principal_node <- function(cds, time_bin="130-170"){
  cell_ids <- which(colData(cds)[, "embryo.time.bin"] == time_bin)
  
  closest_vertex <-
    cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    igraph::V(principal_graph(cds)[["UMAP"]])$name[as.numeric(names
                                                              (which.max(table(closest_vertex[cell_ids,]))))]
  
  root_pr_nodes
}
cds <- order_cells(cds, root_pr_nodes=get_earliest_principal_node(cds))

plot_cells(cds,
           color_cells_by = "pseudotime",
           label_cell_groups=FALSE,
           label_leaves=FALSE,
           label_branch_points=FALSE,
           show_trajectory_graph = FALSE)

# 沿轨迹图交互选择起始节点以及终节点的单细胞
cds_sub <- choose_graph_segments(cds)

### 5. 单细胞轨迹图上显示差异表达基因的表达量
ciliated_cds_pr_test_res <- graph_test(cds, neighbor_graph="principal_graph", cores=4)
pr_deg_ids <- row.names(subset(ciliated_cds_pr_test_res, q_value < 0.05))
plot_cells(cds, genes=c("hlh-4", "gcy-8", "dac-1", "oig-8"),
           show_trajectory_graph=FALSE,
           label_cell_groups=FALSE,
           label_leaves=FALSE)
### 6. 将沿单细胞轨迹图的差异表达基因进行模块聚类分析，共调节模块分析
gene_module_df <- find_gene_modules(cds[pr_deg_ids,], resolution=c(0,10^seq(-6,-1)))

cell_group_df <- tibble::tibble(cell=row.names(colData(cds)), 
                                cell_group=colData(cds)$cell.type)
# 聚合表达矩阵:行为module，列为cell.type
agg_mat <- aggregate_gene_expression(cds, gene_module_df, cell_group_df)
row.names(agg_mat) <- stringr::str_c("Module ", row.names(agg_mat))
# 可视化:热图
pheatmap::pheatmap(agg_mat,
                   scale="column", clustering_method="ward.D2")

# 比较不同module的基因聚合表达值
plot_cells(cds,
           genes=gene_module_df %>% filter(module %in% c(27, 10, 7, 30)),
           label_cell_groups=FALSE,
           show_trajectory_graph=FALSE)
# 特定基因在特定细胞类型中的伪时间轨迹图中的表达量作图
AFD_genes <- c("gcy-8", "dac-1", "oig-8")
AFD_lineage_cds <- cds[rowData(cds)$gene_short_name %in% AFD_genes,
                       colData(cds)$cell.type %in% c("AFD")]

plot_genes_in_pseudotime(AFD_lineage_cds,
                         color_cells_by="embryo.time.bin",
                         min_expr=0.5)

### 7. 3d单细胞伪时间轨迹图
cds_3d <- reduce_dimension(cds, max_components = 3)
cds_3d <- cluster_cells(cds_3d)
cds_3d <- learn_graph(cds_3d)
cds_3d <- order_cells(cds_3d, root_pr_nodes=get_earliest_principal_node(cds))

cds_3d_plot_obj <- plot_cells_3d(cds_3d, color_cells_by="partition")