10X单细胞（10X空间转录组）分析回顾之一些细节绘图操作

最新推荐文章于 2024-08-04 14:57:48 发布

追风少年ii

最新推荐文章于 2024-08-04 14:57:48 发布

阅读量256

点赞数 10

文章标签：空间转录组单细胞数据分析

本文链接：https://blog.csdn.net/weixin_53637133/article/details/138562529

版权

临近年底，我们要总结，马上回家了，大家今天到底挣了点什么？？反省一下，今天的自己，有没有更优秀，这一篇我给大家分享一些单细胞分析图片的细节操作，图片是我们分析结果的展示，要美观大方，就算我们过年要相亲了，自己也要打扮的精精神神的，😄

第一张图，Cluster之间的相关性

sce <- as.SingleCellExperiment(seurat)
reducedDim(sce, 'PCA_sub') <- reducedDim(sce, 'PCA')[,1:15, drop = FALSE]

ass_prob <- bootstrapCluster(sce, FUN = function(x) {
    g <- buildSNNGraph(x, use.dimred = 'PCA_sub')
    igraph::cluster_walktrap(g)$membership
  },
  clusters = sce$seurat_clusters
)

p <- ass_prob %>%
  as_tibble() %>%
  rownames_to_column(var = 'cluster_1') %>%
  pivot_longer(
    cols = 2:ncol(.),
    names_to = 'cluster_2',
    values_to = 'probability'
  ) %>%
  mutate(
    cluster_1 = as.character(as.numeric(cluster_1) - 1),
    cluster_1 = factor(cluster_1, levels = rev(unique(cluster_1))),
    cluster_2 = factor(cluster_2, levels = unique(cluster_2))
  ) %>%
  ggplot(aes(cluster_2, cluster_1, fill = probability)) +
  geom_tile(color = 'white') +
  geom_text(aes(label = round(probability, digits = 2)), size = 2.5) +
  scale_x_discrete(name = 'Cluster', position = 'top') +
  scale_y_discrete(name = 'Cluster') +
  scale_fill_gradient(
    name = 'Probability', low = 'white', high = '#c0392b', na.value = '#bdc3c7',
    limits = c(0,1),
    guide = guide_colorbar(
      frame.colour = 'black', ticks.colour = 'black', title.position = 'left',
      title.theme = element_text(hjust = 1, angle = 90),
      barwidth = 0.75, barheight = 10
    )
  ) +
  coord_fixed() +
  theme_bw() +
  theme(
    legend.position = 'right',
    panel.grid.major = element_blank()
  )

ggsave('plots/cluster_stability.png', p, height = 6, width = 7)

Silhouette plot

library(cluster)

distance_matrix <- dist(Embeddings(seurat[['pca']])[, 1:15])
clusters <- seurat@meta.data$seurat_clusters
silhouette <- silhouette(as.numeric(clusters), dist = distance_matrix)
seurat@meta.data$silhouette_score <- silhouette[,3]

mean_silhouette_score <- mean(seurat@meta.data$silhouette_score)

p <- seurat@meta.data %>%
  mutate(barcode = rownames(.)) %>%
  arrange(seurat_clusters,-silhouette_score) %>%
  mutate(barcode = factor(barcode, levels = barcode)) %>%
  ggplot() +
  geom_col(aes(barcode, silhouette_score, fill = seurat_clusters), show.legend = FALSE) +
  geom_hline(yintercept = mean_silhouette_score, color = 'red', linetype = 'dashed') +
  scale_x_discrete(name = 'Cells') +
  scale_y_continuous(name = 'Silhouette score') +
  scale_fill_manual(values = custom_colors$discrete) +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggsave('plots/silhouette_plot.png', p, height = 4, width = 8)

Cluster tree

seurat <- BuildClusterTree(
  seurat,
  dims = 1:15,
  reorder = FALSE,
  reorder.numeric = FALSE
)

tree <- seurat@tools$BuildClusterTree
tree$tip.label <- paste0("Cluster ", tree$tip.label)

p <- ggtree::ggtree(tree, aes(x, y)) +
  scale_y_reverse() +
  ggtree::geom_tree() +
  ggtree::theme_tree() +
  ggtree::geom_tiplab(offset = 1) +
  ggtree::geom_tippoint(color = custom_colors$discrete[1:length(tree$tip.label)], shape = 16, size = 5) +
  coord_cartesian(clip = 'off') +
  theme(plot.margin = unit(c(0,2.5,0,0), 'cm'))

ggsave('plots/cluster_tree.png', p, height = 4, width = 6)

比例图

temp_labels <- seurat@meta.data %>%
  group_by(sample) %>%
  tally()

p1 <- table_samples_by_clusters %>%
  select(-c('total_cell_count')) %>%
  reshape2::melt(id.vars = 'sample') %>%
  mutate(sample = factor(sample, levels = levels(seurat@meta.data$sample))) %>%
  ggplot(aes(sample, value)) +
  geom_bar(aes(fill = variable), position = 'fill', stat = 'identity') +
  geom_text(
    data = temp_labels,
    aes(x = sample, y = Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_fill_manual(name = 'Cluster', values = custom_colors$discrete) +
  scale_y_continuous(name = 'Percentage [%]', labels = scales::percent_format(), expand = c(0.01,0)) +
  coord_cartesian(clip = 'off') +
  theme_bw() +
  theme(
    legend.position = 'left',
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    plot.margin = margin(t = 20, r = 0, b = 0, l = 0, unit = 'pt')
  )

temp_labels <- seurat@meta.data %>%
  group_by(seurat_clusters) %>%
  tally() %>%
  dplyr::rename('cluster' = seurat_clusters)

p2 <- table_clusters_by_samples %>%
  select(-c('total_cell_count')) %>%
  reshape2::melt(id.vars = 'cluster') %>%
  mutate(cluster = factor(cluster, levels = levels(seurat@meta.data$seurat_clusters))) %>%
  ggplot(aes(cluster, value)) +
  geom_bar(aes(fill = variable), position = 'fill', stat = 'identity') +
  geom_text(
    data = temp_labels, aes(x = cluster, y = Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_fill_manual(name = 'Sample', values = custom_colors$discrete) +
  scale_y_continuous(name = 'Percentage [%]', labels = scales::percent_format(), expand = c(0.01,0)) +
  coord_cartesian(clip = 'off') +
  theme_bw() +
  theme(
    legend.position = 'right',
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_blank(),
    plot.margin = margin(t = 20, r = 0, b = 0, l = 10, unit = 'pt')
  )

ggsave(
  'plots/composition_samples_clusters_by_percent.png',
  p1 + p2 +
  plot_layout(ncol = 2, widths = c(
    seurat@meta.data$sample %>% unique() %>% length(),
    seurat@meta.data$seurat_clusters %>% unique() %>% length()
  )),
  width = 18, height = 8
)

temp_labels <- seurat@meta.data %>%
  group_by(sample) %>%
  tally()

p1 <- table_samples_by_cell_cycle %>%
  select(-c('total_cell_count')) %>%
  reshape2::melt(id.vars = 'sample') %>%
  mutate(sample = factor(sample, levels = levels(seurat@meta.data$sample))) %>%
  ggplot(aes(sample, value)) +
  geom_bar(aes(fill = variable), position = 'fill', stat = 'identity') +
  geom_text(
    data = temp_labels,
    aes(x = sample, y = Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_fill_manual(name = 'Cell cycle', values = custom_colors$cell_cycle) +
  scale_y_continuous(name = 'Percentage [%]', labels = scales::percent_format(), expand = c(0.01,0)) +
  coord_cartesian(clip = 'off') +
  theme_bw() +
  theme(
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    plot.margin = margin(t = 20, r = 0, b = 0, l = 0, unit = 'pt')
  )

temp_labels <- seurat@meta.data %>%
  group_by(seurat_clusters) %>%
  tally() %>%
  dplyr::rename('cluster' = seurat_clusters)

p2 <- table_clusters_by_cell_cycle %>%
  select(-c('total_cell_count')) %>%
  reshape2::melt(id.vars = 'cluster') %>%
  mutate(cluster = factor(cluster, levels = levels(seurat@meta.data$seurat_clusters))) %>%
  ggplot(aes(cluster, value)) +
  geom_bar(aes(fill = variable), position = 'fill', stat = 'identity') +
  geom_text(
    data = temp_labels,
    aes(x = cluster, y = Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_fill_manual(name = 'Cell cycle', values = custom_colors$cell_cycle) +
  scale_y_continuous(name = 'Percentage [%]', labels = scales::percent_format(), expand = c(0.01,0)) +
  coord_cartesian(clip = 'off') +
  theme_bw() +
  theme(
    legend.position = 'right',
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_blank(),
    plot.margin = margin(t = 20, r = 0, b = 0, l = 10, unit = 'pt')
  )

ggsave(
  'plots/composition_samples_clusters_by_cell_cycle_by_percent.png',
  p1 + p2 +
  plot_layout(ncol = 2, widths = c(
    seurat@meta.data$sample %>% unique() %>% length(),
    seurat@meta.data$seurat_clusters %>% unique() %>% length()
  )),
  width = 18, height = 8
)

SNN graph

library(ggnetwork)

SCT_snn <- seurat@graphs$SCT_snn %>%
  as.matrix() %>%
  ggnetwork() %>%
  left_join(seurat@meta.data %>% mutate(vertex.names = rownames(.)), by = 'vertex.names')

p1 <- ggplot(SCT_snn, aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_edges(color = 'grey50', alpha = 0.05) +
  geom_nodes(aes(color = sample), size = 0.5) +
  scale_color_manual(
    name = 'Sample', values = custom_colors$discrete,
    guide = guide_legend(ncol = 1, override.aes = list(size = 2))
  ) +
  theme_blank() +
  theme(legend.position = 'left') +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

p2 <- ggplot(SCT_snn, aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_edges(color = 'grey50', alpha = 0.05) +
  geom_nodes(aes(color = seurat_clusters), size = 0.5) +
  scale_color_manual(
    name = 'Cluster', values = custom_colors$discrete,
    guide = guide_legend(ncol = 1, override.aes = list(size = 2))
  ) +
  theme_blank() +
  theme(legend.position = 'right') +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

ggsave(
  'plots/snn_graph_by_sample_cluster.png',
  p1 + p2 + plot_layout(ncol = 2),
  height = 5, width = 11
)

UMAP图

plot_umap_by_nCount <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = nCount_RNA)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_viridis(
    guide = guide_colorbar(frame.colour = 'black', ticks.colour = 'black'),
    labels = scales::comma,
  ) +
  labs(color = 'Number of\ntranscripts') +
  theme(legend.position = 'left') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

plot_umap_by_sample <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = sample)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_manual(values = custom_colors$discrete) +
  labs(color = 'Sample') +
  guides(colour = guide_legend(override.aes = list(size = 2))) +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

plot_umap_by_cluster <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = seurat_clusters)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_manual(
    name = 'Cluster', values = custom_colors$discrete,
    guide = guide_legend(ncol = 2, override.aes = list(size = 2))
  ) +
  theme(legend.position = 'left') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

plot_umap_by_cell_cycle <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = cell_cycle_seurat)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_manual(values = custom_colors$cell_cycle) +
  labs(color = 'Cell cycle') +
  guides(colour = guide_legend(override.aes = list(size = 2))) +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

ggsave(
  'plots/umap.png',
  plot_umap_by_nCount + plot_umap_by_sample +
  plot_umap_by_cluster + plot_umap_by_cell_cycle +
  plot_layout(ncol = 2),
  height = 6,
  width = 8.5
)

Alluvial plots

## get sample and cluster names
samples <- levels(seurat@meta.data$sample)
clusters <- levels(seurat@meta.data$seurat_clusters)

## create named vector holding the color assignments for both samples and
## clusters
color_assignments <- setNames(
  c(custom_colors$discrete[1:length(samples)], custom_colors$discrete[1:length(clusters)]),
  c(samples,clusters)
)

## prepare data for the plot; factor() calls are necessary for the right order
## of columns (first samples then clusters) and boxes within each column (
## cluster 1, 2, 3, ..., not 1, 10, 11, ...)
data <- seurat@meta.data %>%
  group_by(sample,seurat_clusters) %>%
  tally() %>%
  ungroup() %>%
  gather_set_data(1:2) %>%
  dplyr::mutate(
    x = factor(x, levels = unique(x)),
    y = factor(y, levels = unique(y))
  )

DataFrame(data)
# DataFrame with 114 rows and 6 columns
#       sample seurat_clusters         n        id               x        y
#     <factor>        <factor> <integer> <integer>        <factor> <factor>
# 1          A               0       301         1          sample        A
# 2          A               1       137         2          sample        A
# 3          A               2       121         3          sample        A
# 4          A               3       223         4          sample        A
# 5          A               4        78         5          sample        A
# ...      ...             ...       ...       ...             ...      ...
# 110        E              14        19        53 seurat_clusters       14
# 111        E              15        18        54 seurat_clusters       15
# 112        E              16        10        55 seurat_clusters       16
# 113        E              17         9        56 seurat_clusters       17
# 114        E              18        15        57 seurat_clusters       18

## create sample and cluster labels; hjust defines whether a label will be
## aligned to the right (1) or to the left (0); the nudge_x parameter is used
## to move the label outside of the boxes
data_labels <- tibble(
    group = c(
      rep('sample', length(samples)),
      rep('seurat_clusters', length(clusters))
    )
 ) %>%
  mutate(
    hjust = ifelse(group == 'sample', 1, 0),
    nudge_x = ifelse(group == 'sample', -0.1, 0.1)
  )

DataFrame(data_labels)
# DataFrame with 22 rows and 3 columns
#               group     hjust   nudge_x
#         <character> <numeric> <numeric>
# 1            sample         1      -0.1
# 2            sample         1      -0.1
# 3            sample         1      -0.1
# 4   seurat_clusters         0       0.1
# 5   seurat_clusters         0       0.1
# ...             ...       ...       ...
# 18  seurat_clusters         0       0.1
# 19  seurat_clusters         0       0.1
# 20  seurat_clusters         0       0.1
# 21  seurat_clusters         0       0.1
# 22  seurat_clusters         0       0.1

## create plot
p1 <- ggplot(data, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = seurat_clusters), alpha = 0.75, axis.width = 0.15) +
  geom_parallel_sets_axes(aes(fill = y), color = 'black', axis.width = 0.1) +
  geom_text(
    aes(y = n, split = y), stat = 'parallel_sets_axes', fontface = 'bold',
    hjust = data_labels$hjust, nudge_x = data_labels$nudge_x
  ) +
  scale_x_discrete(labels = c('Sample','Cluster')) +
  scale_fill_manual(values = color_assignments) +
  theme_bw() +
  theme(
    legend.position = 'none',
    axis.title = element_blank(),
    axis.text.x = element_text(face = 'bold', colour = 'black', size = 15),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )

clusters <- levels(seurat@meta.data$seurat_clusters)
cell_types <- sort(unique(seurat@meta.data$cell_type_singler_blueprintencode_main))

color_assignments <- setNames(
  c(custom_colors$discrete[1:length(clusters)], custom_colors$discrete[1:length(cell_types)]),
  c(clusters,cell_types)
)

data <- seurat@meta.data %>%
  dplyr::rename(cell_type = cell_type_singler_blueprintencode_main) %>%
  dplyr::mutate(cell_type = factor(cell_type, levels = cell_types)) %>%
  group_by(seurat_clusters, cell_type) %>%
  tally() %>%
  ungroup() %>%
  gather_set_data(1:2) %>%
  dplyr::mutate(
    x = factor(x, levels = unique(x)),
    y = factor(y, levels = c(clusters,cell_types))
  )

data_labels <- tibble(
    group = c(
      rep('seurat_clusters', length(clusters)),
      rep('cell_type', length(cell_types))
    )
 ) %>%
  mutate(
    hjust = ifelse(group == 'seurat_clusters', 1, 0),
    nudge_x = ifelse(group == 'seurat_clusters', -0.1, 0.1)
  )

p2 <- ggplot(data, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = seurat_clusters), alpha = 0.75, axis.width = 0.15) +
  geom_parallel_sets_axes(aes(fill = y), color = 'black', axis.width = 0.1) +
  geom_text(
    aes(y = n, split = y), stat = 'parallel_sets_axes', fontface = 'bold',
    hjust = data_labels$hjust, nudge_x = data_labels$nudge_x
  ) +
  scale_x_discrete(labels = c('Cluster','Cell type')) +
  scale_fill_manual(values = color_assignments) +
  theme_bw() +
  theme(
    legend.position = 'none',
    axis.title = element_blank(),
    axis.text.x = element_text(face = 'bold', colour = 'black', size = 15),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )

ggsave(
  'plots/samples_clusters_cell_types_alluvial.png',
  p1 + p2 + plot_layout(ncol = 2),
  height = 6, width = 8
)

UMAP by cell type

UMAP_centers_cell_type <- tibble(
    UMAP_1 = as.data.frame(seurat@reductions$UMAP@cell.embeddings)$UMAP_1,
    UMAP_2 = as.data.frame(seurat@reductions$UMAP@cell.embeddings)$UMAP_2,
    cell_type = seurat@meta.data$cell_type_singler_blueprintencode_main
  ) %>%
  group_by(cell_type) %>%
  summarize(x = median(UMAP_1), y = median(UMAP_2))

p <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = cell_type_singler_blueprintencode_main)) +
  geom_point(size = 0.2) +
  geom_label(
    data = UMAP_centers_cell_type,
    mapping = aes(x, y, label = cell_type),
    size = 3.5,
    fill = 'white',
    color = 'black',
    fontface = 'bold',
    alpha = 0.5,
    label.size = 0,
    show.legend = FALSE
  ) +
  theme_bw() +
  expand_limits(x = c(-22,15)) +
  scale_color_manual(values = custom_colors$discrete) +
  labs(color = 'Cell type') +
  guides(colour = guide_legend(override.aes = list(size = 2))) +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

ggsave(
  'plots/umap_by_cell_type_singler_blueprintencode_main.png',
  p,
  height = 4,
  width = 6
)

Dot plot (multiple genes)

# cells will be grouped by samples that they have been assigned to
group_ids <- unique(seurat@meta.data$cell_type_singler_blueprintencode_main)

# select a set of genes for which we want to show expression
genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$cell_type_singler_blueprintencode_main %>%
  group_by(cell_type_singler_blueprintencode_main) %>%
  arrange(p_val_adj) %>%
  filter(row_number() == 1) %>%
  arrange(cell_type_singler_blueprintencode_main) %>%
  pull(gene)

# for every sample-gene combination, calculate the average expression across
# all cells and then transform the data into a data frame
expression_levels_per_group <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$cell_type_singler_blueprintencode_main == x)
      Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
  select(cell_type_singler_blueprintencode_main, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(expression = value) %>%
  mutate(id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene))

# for every sample-gene combination, calculate the percentage of cells in the
# respective group that has at least 1 transcript (this means we consider it
# as expressing the gene) and then transform the data into a data frame
percentage_of_cells_expressing_gene <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$cell_type_singler_blueprintencode_main == x)
      Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
  select(cell_type_singler_blueprintencode_main, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(cell_count = value) %>%
  left_join(
    .,
    seurat@meta.data %>%
      group_by(cell_type_singler_blueprintencode_main) %>%
      tally(),
    by = 'cell_type_singler_blueprintencode_main') %>%
  mutate(
    id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene),
    percent_cells = cell_count / n
  )

# merge the two data frames created before and plot the data
p <- left_join(
    expression_levels_per_group,
    percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
    by = 'id_to_merge'
  ) %>%
  mutate(
    cell_type_singler_blueprintencode_main = factor(cell_type_singler_blueprintencode_main, levels = rev(group_ids)),
    gene = factor(gene, levels = genes_to_show)
  ) %>%
  arrange(gene) %>%
  ggplot(aes(gene, cell_type_singler_blueprintencode_main)) +
  geom_point(aes(color = expression, size = percent_cells)) +
  scale_color_distiller(
    palette = 'Reds',
    direction = 1,
    name = 'Log-normalised\nexpression',
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  scale_size(name = 'Percent\nof cells', labels = scales::percent) +
  labs(y = 'Cell type', color = 'Expression') +
  coord_fixed() +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

ggsave('plots/marker_genes_by_cell_type_as_dot_plot.png', p, height = 5, width = 6)

group_ids <- levels(seurat@meta.data$seurat_clusters)

genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$seurat_clusters %>%
  group_by(seurat_clusters) %>%
  arrange(p_val_adj) %>%
  filter(row_number() == 1) %>%
  arrange(seurat_clusters) %>%
  pull(gene)

expression_levels_per_group <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$seurat_clusters == x)
      Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(expression = value) %>%
  mutate(id_to_merge = paste0(cluster, '_', gene))

percentage_of_cells_expressing_gene <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$seurat_cluster == x)
      Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(cell_count = value) %>%
  left_join(
    .,
    seurat@meta.data %>%
      group_by(seurat_clusters) %>%
      tally() %>%
      dplyr::rename(cluster = seurat_clusters),
    by = 'cluster') %>%
  mutate(
    id_to_merge = paste0(cluster, '_', gene),
    percent_cells = cell_count / n
  )

p <- left_join(
    expression_levels_per_group,
    percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
    by = 'id_to_merge'
  ) %>%
  mutate(
    cluster = factor(cluster, levels = rev(group_ids)),
    gene = factor(gene, levels = genes_to_show)
  ) %>%
  arrange(gene) %>%
  ggplot(aes(gene, cluster)) +
  geom_point(aes(color = expression, size = percent_cells)) +
  scale_color_distiller(
    palette = 'Reds',
    direction = 1,
    name = 'Log-normalised\nexpression',
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  scale_size(name = 'Percent\nof cells', labels = scales::percent) +
  labs(y = 'Cluster', color = 'Expression') +
  coord_fixed() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

ggsave('plots/marker_genes_by_cluster_as_dot_plot.png', p, height = 7, width = 8)

Gene set enrichment analysis

# function to read GMT file
read_GMT_file <- function(file) {
  gmt <- readr::read_delim(
    file,
    delim = ';',
    col_names = c('X1'),
    col_types = readr::cols()
  )

  gene_set_genes <- list()
  for ( i in seq_len(nrow(gmt)) )
  {
    temp_genes <- strsplit(gmt$X1[i], split = '\t')[[1]] %>% unlist()
    temp_genes <- temp_genes[3:length(temp_genes)]
    gene_set_genes[[i]] <- temp_genes
  }
  gene_set_loaded <- list(
    genesets = gene_set_genes,
    geneset.names = lapply(strsplit(gmt$X1, split = '\t'), '[', 1) %>% unlist(),
    geneset.description = lapply(
        strsplit(gmt$X1, split = '\t'), '[', 2
      ) %>% unlist()
  )

  return(gene_set_loaded)
}

# load gene sets from GMT file
gene_sets <- read_GMT_file('h.all.v7.2.symbols.gmt')

# set gene set names
names(gene_sets$genesets) <- gene_sets$geneset.names

# get indices of cells which are either HSC or monocytes
cells_to_analyze <- seurat@meta.data %>%
  mutate(row_number = row_number()) %>%
  filter(grepl(cell_type_singler_blueprintencode_main, pattern = 'HSC|Monocytes')) %>%
  arrange(cell_type_singler_blueprintencode_main) %>%
  pull(row_number)

# get list of genes unique genes across all gene sets
genes_to_analyze <- gene_sets$genesets %>% unlist() %>% unique()
# filter gene list for those which are present in the data set
genes_to_analyze <- genes_to_analyze[which(genes_to_analyze %in% rownames(seurat@assays$SCT@counts))]

# get expression matrix and reduce it to cells and genes of interest
expression_matrix <- seurat@assays$SCT@counts[ genes_to_analyze , cells_to_analyze] %>% as.matrix()

# perform GSVA
gsva <- GSVA::gsva(
  expression_matrix,
  gset.idx.list = gene_sets$genesets,
  parallel.sz = 1
)

# load limma library for statistical testing
library(limma)

# generate design matrix
design_matrix <- tibble(
  control = 1,
  test = c(
    rep(0, seurat@meta.data %>% filter(cell_type_singler_blueprintencode_main == 'HSC') %>% nrow()),
    rep(1, seurat@meta.data %>% filter(cell_type_singler_blueprintencode_main == 'Monocytes') %>% nrow())
  )
)

head(design_matrix)
# A tibble: 6 x 2
#   control  test
#     <dbl> <dbl>
# 1       1     0
# 2       1     0
# 3       1     0
# 4       1     0
# 5       1     0
# 6       1     0

# fit linear model, followed by empirical Bayes statistics for differential
# enrichment analysis
fit <- lmFit(gsva, design_matrix)
fit <- eBayes(fit)

# prepare data for plotting
data <- topTable(fit, coef = 'test', number = 50) %>%
  mutate(gene_set = rownames(fit$t)) %>%
  arrange(t) %>%
  mutate(
    gene_set = factor(gene_set, levels = gene_set),
    just = ifelse(t < 0, 0, 1),
    nudge_y = ifelse(t < 0, 1, -1),
    color = ifelse(t < -5 | t > 5, 'black', 'grey')
  )

DataFrame(data)
# DataFrame with 50 rows and 10 columns
#         logFC     AveExpr         t      P.Value    adj.P.Val         B                            gene_set      just   nudge_y       color
#     <numeric>   <numeric> <numeric>    <numeric>    <numeric> <numeric>                            <factor> <numeric> <numeric> <character>
# 1   -0.369443  -0.1569690  -35.1619 1.26066e-186 1.05055e-185   415.980  HALLMARK_TGF_BETA_SIGNALING                0         1       black
# 2   -0.251413  -0.0820012  -27.2261 1.96152e-127 8.91598e-127   279.761  HALLMARK_NOTCH_SIGNALING                   0         1       black
# 3   -0.288169  -0.1202293  -26.1464 1.40712e-119 5.41201e-119   261.689  HALLMARK_ESTROGEN_RESPONSE_EARLY           0         1       black
# 4   -0.135034  -0.1511146  -22.3565  8.50194e-93  2.36165e-92   200.094  HALLMARK_INTERFERON_ALPHA_RESPONSE         0         1       black
# 5   -0.203651  -0.1049498  -22.0728  7.44006e-91  1.95791e-90   195.628  HALLMARK_INTERFERON_GAMMA_RESPONSE         0         1       black
# ...       ...         ...       ...          ...          ...       ...                                 ...       ...       ...         ...
# 46   0.200259  0.25341594   35.9250 2.31901e-192 2.31901e-191   429.182 HALLMARK_WNT_BETA_CATENIN_SIGNALING         1        -1       black
# 47   0.293113 -0.08115610   36.1995 2.00784e-194 2.50980e-193   433.929 HALLMARK_MITOTIC_SPINDLE                    1        -1       black
# 48   0.338449  0.11583774   36.2560 7.56196e-195 1.26033e-193   434.906 HALLMARK_CHOLESTEROL_HOMEOSTASIS            1        -1       black
# 49   0.251678  0.00262136   37.5117 2.86772e-204 7.16930e-203   456.592 HALLMARK_HYPOXIA                            1        -1       black
# 50   0.282907 -0.05021404   48.5971 1.72523e-285 8.62616e-284   643.571 HALLMARK_TNFA_SIGNALING_VIA_NFKB            1        -1       black

# plot t-value
p <- ggplot(data = data, aes(x = gene_set, y = t, fill = t)) +
  geom_col() +
  geom_hline(yintercept = c(-5,5), linetype = 'dashed', color = 'grey80') +
  geom_text(
    aes(
      x = gene_set,
      y = 0,
      label = gene_set,
      hjust = just,
      color = color
    ),
    nudge_y = data$nudge_y, size = 3
  ) +
  scale_x_discrete(name = '', labels = NULL) +
  scale_y_continuous(name = 't-value', limits = c(-55,55)) +
  scale_fill_distiller(palette = 'Spectral', limits = c(-max(data$t), max(data$t))) +
  scale_color_manual(values = c('black' = 'black', 'grey' = 'grey')) +
  coord_flip() +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.ticks.y =  element_blank(),
    legend.position = 'none'
  )

ggsave('plots/gsva_hallmark_gene_sets_monocytes_vs_hsc.png', height = 7.5, width = 7.5)

通讯热图

library(pheatmap)
library(argparse)

heatmap <- function (Matrix,labels){
     plot_heatmap <- pheatmap(Matrix,scale='none',cluster_cols = F,cluster_rows = F,cellwidth = 20,cellheight = 20,color = colorRampPalette(colors = c("white","red"))(100),display_numbers=labels)
     return(plot_heatmap)
}

getSig <- function(dc) {
if(dc > pvalue){sc <- ""}
if(dc < pvalue){sc <- "*"}}

parser = ArgumentParser()
parser$add_argument("--LR_score", help="the file of sample.LR.score.xls",required =T)
parser$add_argument("--pvalue", help="the file of sample.LR.score.xls",default = '0.1')
parser$add_argument("--outdir", help="the outdir",default='.')
args <- parser$parse_args()
str(args)

LR_score = args$LR_score
pvalue = args$pvalue
outdir = args$outdir

pvalue <- as.numeric(pvalue)

if(!dir.exists(outdir)){
  dir.create(outdir)
}

setwd(outdir)
file <- read.table(LR_score,sep='\t',header=T,row.names=1,check.names=F)
len <- as.numeric(length(colnames(file)))
Max <- max(as.numeric(sapply(strsplit(colnames(file),'_'),"[",2)[1:len/2]))

for (i in 1:Max){
index_score <- as.array(which(sapply(strsplit(colnames(file),'_'),"[",1) == paste0('cluster',as.character(i))))
index_pvalue <- as.array(which(sapply(strsplit(colnames(file),'_'),"[",2) == paste0('pvalue',as.character(i))))
index <- c(index_score,index_pvalue)
Matrix <- file[,index]
LR <- rownames(as.matrix(sort(apply(Matrix,1,sum),decreasing=T)[1:30]))
Matrix_LR <- Matrix[LR,]
label <- as.matrix(sapply(Matrix_LR[,as.numeric(Max+1):as.numeric(length(colnames(Matrix_LR)))],function(x){ifelse(x>pvalue,sc<-"",sc<-"*")}),ncol=Max)
pdf(paste0('cluster',i,'_others_LR.pdf'),width=6,height=14)
heatmap(Matrix_LR[,1:Max],label)
dev.off()
png(paste0('cluster',i,'_others_LR.png'),width=6*200,height=14*200,res=200,type = 'cario-png')
dev.off()
}