Li——01

天不怕地不怕只怕代码

已于 2022-04-14 19:40:10 修改

阅读量499

点赞数 2

文章标签： r语言

于 2022-04-07 16:03:33 首次发布

本文链接：https://blog.csdn.net/yayiling/article/details/124014694

版权

library(Seurat)
library(dplyr)
library(ggplot2)
library(cowplot)
library(rlang)
library(clustree)
library(patchwork)
library(ggraph)
library(ggsci)
library(DoubletFinder)

1、Initialize Data

raw.s03.data <- Read10X("~/Li/s03",gene.column=1)
raw.s04.data <- Read10X("~/Li/s04",gene.column=1)
##创建Seurat对象
raw.s03 <- CreateSeuratObject(counts= raw.s03.data)
raw.s03
##An object of class Seurat
##31897 features across 2853 samples within 1 assay
##Active assay: RNA (31897 features, 0 variable features)
dim(raw.s03)
# 21358 2853
raw.s04 <- CreateSeuratObject(counts= raw.s04.data)
raw.s04
##An object of class Seurat
##33168 features across 2863 samples within 1 assay
##Active assay: RNA (33168 features, 0 variable features)

2、QC

2.1 remove Doublets

# [s03的过滤]
library(DoubletFinder)
raw.s03 <- SCTransform(raw.s03, verbose = F) %>% RunPCA(npcs = 30, verbose = F)
# pdf("Li/figures/elbow-10X-s03.pdf")
# ElbowPlot(raw.s03,ndims = 30)
# dev.off()
# 选定dims为25

raw.s03 <- RunUMAP(raw.s03, dims=1:25,verbose = F) %>% FindNeighbors(reduction = "pca", dims = 1:25)
# saveRDS(s03,"~/Li/data")
# checkRes <- FindClusters(object = raw.s03, resolution = c(seq(.1,1.4,.2)), verbose = F)
# pdf(file = "~/Li/figures/QC-checkres-s03.pdf", height = 8, width = 10, onefile = TRUE)
# clustree(checkRes@meta.data,prefix = "SCT_snn_res.")
# dev.off()
raw.s03<- FindClusters(raw.s03, resolution = 0.7)
# Optimize the parameters
sweep.res.list.s03 <- paramSweep_v3(raw.s03, PCs = 1:25, sct = T)
# Use log transform
sweep.stats.s03 <- summarizeSweep(sweep.res.list.s03, GT = F)
# save(sweep.stats, file = "data/excludeP4B.obj/clean.data/sweepStats-10X.RData")
# Show the best parameter
bcmvn.s03 <- find.pK(sweep.stats.s03)
# Extract the best pK
pK_bcmvn.s03 <- bcmvn.s03$pK[which.max(bcmvn.s03$BCmetric)] %>% as.character() %>% as.numeric()
pK_bcmvn.s03
## [1] 0.01
# Estimate the percentage of homotypic doublets
homotypic.prop.s03 <- modelHomotypic(raw.s03$seurat_clusters)
# 10x的双细胞率为0.9%/1000 cell= 0.009/1000*3000=0.027
DoubletRate <- 0.027
nExp_poi.s03 <- round(DoubletRate*ncol(raw.s03)) 
# Adjust for homotypic doublets
nExp_poi.adj.s03 <- round(nExp_poi.s03*(1-homotypic.prop.s03))
raw.s03 <- doubletFinder_v3(raw.s03, PCs = 1:25, pN = 0.25, pK = pK_bcmvn.s03, nExp = nExp_poi.adj.s03, reuse.pANN = F, sct = T)
# saveRDS(raw.s03, file = "~/Li/data/raw.s03-db.RDS")
# Present the res, classification info is saved in meta.data
DF.name.s03 <- colnames(raw.s03@meta.data)[grepl("DF.classification", colnames(raw.s03@meta.data))]
# Visualization
# pdf(file = "~/Li/figures/doublet-res.s03.pdf", height = 5, width = 12, onefile = TRUE)
cowplot::plot_grid(ncol = 2, DimPlot(raw.s03, group.by = "orig.ident") + NoAxes(),DimPlot(raw.s03, group.by = DF.name.s03) + NoAxes())

raw.s03.singlet <- raw.s03[, raw.s03@meta.data[, DF.name.s03] == "Singlet"]
#saveRDS(raw.s03.singlet,"~/Li/data/raw.s03.singlet")
raw.s03.singlet
# An object of class Seurat
# 53255 features across 2785 samples within 2 assays
# Active assay: SCT (21358 features, 3000 variable features)
# 1 other assay present: RNA
# 2 dimensional reductions calculated: pca, umap

2.2 Calculate QC

# mitochondrial ratio
raw.s03.singlet[["percent.mt"]] <- PercentageFeatureSet(object = raw.s03.singlet, pattern = "^MT-", assay = "RNA")
# number of genes detected per UMI - represent the complexity of our dataset (more genes detected per UMI, more complex our data)
raw.s03.singlet[["log10GenesPerUMI"]] <- log10(raw.s03.singlet$nFeature_RNA) / log10(raw.s03.singlet$nCount_RNA)
# 可视化质控指标
# 一般可视化以下三个参数即可："nFeature_RNA", "nCount_RNA", "percent.mt"
VlnPlot(raw.s03.singlet, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size=0.05)

# 2.2.1 Cell-level filtering-------------------------------------------------
# 筛选参考阈值
# nUMI > 500
# nGene > 250
# log10GenesPerUMI > 0.8
# mitoRatio < 0.2
filteredData_s03 <- subset(x = raw.s03.singlet, 
                         subset= (nCount_RNA >= 500) & 
                           (nFeature_RNA >= 250) & 
                           (log10GenesPerUMI > 0.80) & 
                           (percent.mt > 0.20) &
						   (percent.mt < 30))
filteredData_s03
# An object of class Seurat
# 53255 features across 2721 samples within 2 assays
# Active assay: SCT (21358 features, 3000 variable features)
# 1 other assay present: RNA
# 2 dimensional reductions calculated: pca, umap
dim(filteredData_s03)
# 21358 2721

# 2.2.2 Gene-level filtering-------------------------------------------------
# 首先，我们将去除所有细胞中零表达的基因。此外，我们将根据流行程度执行一些过滤。如果一个基因只在少数细胞中表达，它就没有特别的意义。对于我们的数据，我们选择只保留在3个或3个以上细胞中表达的基因
# Extract counts
counts.s03 <- GetAssayData(object = filteredData_s03, slot = "counts", assay = "RNA")
# Output a logical vector for every gene on whether the more than zero counts per cell
nonzero.s03 <- counts.s03 > 0
# Sums all TRUE values and returns TRUE if more than 3 TRUE values per gene
keep_genes.s03 <- Matrix::rowSums(nonzero.s03) >= 3
# Only keeping those genes expressed in more than 3 cells
filtered_counts.s03 <- counts.s03[keep_genes.s03, ]
# Reassign to filtered Seurat object
clean.s03 <- CreateSeuratObject(filtered_counts.s03, meta.data = filteredData_s03@meta.data)
# clean.s03
# An object of class Seurat
# 23511 features across 2721 samples within 1 assay
# Active assay: RNA (23511 features, 0 variable features)
dim(clean.s03)
# 23511 2721
# saveRDS(clean.s03, file = "~/Li/data/clean.s03.RDS")

# 【s04的过滤】
#2.2remove Doublets==================================================
library(DoubletFinder)
raw.s04 <- SCTransform(raw.s04, verbose = F) %>% RunPCA(npcs = 30, verbose = F)
# pdf("Li/figures/elbow-10X-s04.pdf")
# ElbowPlot(raw.s04,ndims = 30)
# dev.off()
# 选定dims为25
raw.s04 <- RunUMAP(raw.s04, dims=1:25,verbose = F) %>% FindNeighbors(reduction = "pca", dims = 1:25)
# saveRDS(raw.s04,"~/Li/data/raw.s04")
# checkRes <- FindClusters(object = raw.s04, resolution = c(seq(.1,1.4,.2)), verbose = F)
# pdf(file = "~/Li/figures/QC-checkres-s04.pdf", height = 8, width = 10, onefile = TRUE)
# clustree(checkRes@meta.data,prefix = "SCT_snn_res.")
# dev.off()
raw.s04<- FindClusters(raw.s04, resolution = 0.7)
# Optimize the parameters
sweep.res.list.s04 <- paramSweep_v3(raw.s04, PCs = 1:25, sct = T)
# Use log transform
sweep.stats.s04 <- summarizeSweep(sweep.res.list.s04, GT = F)
# save(sweep.stats, file = "data/excludeP4B.obj/clean.data/sweepStats-10X.RData")
# Show the best parameter
bcmvn.s04 <- find.pK(sweep.stats.s04)
# Extract the best pK
pK_bcmvn.s04 <- bcmvn.s04$pK[which.max(bcmvn.s04$BCmetric)] %>% as.character() %>% as.numeric()
pK_bcmvn.s04
## [1] 0.04
# Estimate the percentage of homotypic doublets
homotypic.prop.s04 <- modelHomotypic(raw.s04$seurat_clusters)
# 10x的双细胞率为0.9%/1000 cell= 0.009/1000
*3000=0.027
DoubletRate <- 0.027
nExp_poi.s04 <- round(DoubletRate*ncol(raw.s04)) 
# Adjust for homotypic doublets
nExp_poi.adj.s04 <- round(nExp_poi.s04*(1-homotypic.prop.s04))
raw.s04 <- doubletFinder_v3(raw.s04, PCs = 1:25, pN = 0.25, pK = pK_bcmvn.s04, nExp = nExp_poi.adj.s04, reuse.pANN = F, sct = T)
saveRDS(raw.s04, file = "~/Li/data/raw.s04-db.RDS")
# Present the res, classification info is saved in meta.data
DF.name.s04 <- colnames(raw.s04@meta.data)[grepl("DF.classification", colnames(raw.s04@meta.data))]
# Visualization
# pdf(file = "~/Li/figures/doublet-res.s04.pdf", height = 5, width = 12, onefile = TRUE)
cowplot::plot_grid(ncol = 2, DimPlot(raw.s04, group.by = "orig.ident") + NoAxes(),DimPlot(raw.s04, group.by = DF.name.s04) + NoAxes())
raw.s04.singlet <- raw.s04[, raw.s04@meta.data[, DF.name.s04] == "Singlet"]
saveRDS(raw.s04.singlet,"~/Li/data/raw.s04.singlet")
raw.s04.singlet
# An object of class Seurat
# 53255 features across 2795 samples within 2 assays
# Active assay: SCT (22377 features, 3000 variable features)
# 1 other assay present: RNA
# 2 dimensional reductions calculated: pca, umap


# 2.3 Calculate QC-------------------------------------------------------------
# mitochondrial ratio
raw.s04.singlet[["percent.mt"]] <- PercentageFeatureSet(object = raw.s04.singlet, pattern = "^MT-", assay = "RNA")
# number of genes detected per UMI - represent the complexity of our dataset (more genes detected per UMI, more complex our data)
raw.s04.singlet[["log10GenesPerUMI"]] <- log10(raw.s04.singlet$nFeature_RNA) / log10(raw.s04.singlet$nCount_RNA)
# 可视化质控指标
# 一般可视化以下三个参数即可："nFeature_RNA", "nCount_RNA", "percent.mt"
VlnPlot(raw.s04.singlet, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size=0.05)

# 2.3.1 Cell-level filtering-------------------------------------------------
# 筛选参考阈值
# nUMI > 500
# nGene > 250
# log10GenesPerUMI > 0.8
# mitoRatio < 0.2
filteredData_s04 <- subset(x = raw.s04.singlet, 
                         subset= (nCount_RNA >= 500) & 
                           (nFeature_RNA >= 250) & 
                           (log10GenesPerUMI > 0.80) & 
                           (percent.mt > 0.20) &
						   (percent.mt < 30))
filteredData_s04
# An object of class Seurat
# 55545 features across 2779 samples within 2 assays
# Active assay: SCT (22377 features, 3000 variable features)
# 1 other assay present: RNA
# 2 dimensional reductions calculated: pca, umap

dim(filteredData_s04)
# 22377 2779

# 2.3.2 Gene-level filtering-------------------------------------------------
# 首先，我们将去除所有细胞中零表达的基因。此外，我们将根据流行程度执行一些过滤。如果一个基因只在少数细胞中表达，它就没有特别的意义。对于我们的数据，我们选择只保留在3个或3个以上细胞中表达的基因
# Extract counts
counts.s04 <- GetAssayData(object = filteredData_s04, slot = "counts", assay = "RNA")
# Output a logical vector for every gene on whether the more than zero counts per cell
nonzero.s04 <- counts.s04 > 0
# Sums all TRUE values and returns TRUE if more than 3 TRUE values per gene
keep_genes.s04 <- Matrix::rowSums(nonzero.s04) >= 3
# Only keeping those genes expressed in more than 3 cells
filtered_counts.s04<- counts.s04[keep_genes.s04, ]
# Reassign to filtered Seurat object
clean.s04 <- CreateSeuratObject(filtered_counts.s04, meta.data = filteredData_s04@meta.data)
# clean.s04
# An object of class Seurat
# 24825 features across 2779 samples within 1 assay
# Active assay: RNA (23511 features, 0 variable features)
dim(clean.s04)
# 24825 2779
saveRDS(clean.s04, file = "~/Li/data/clean.s04.RDS")
clean.s03$source <- "WT"
clean.s04$source <- "WT+h"

3、 Integration

3.1 SCTransform

mergedData <- merge(x= clean.s03, y= clean.s04)
mergedData <- SCTransform(mergedData, vars.to.regress = c("percent.mt"), variable.features.n = 3000, verbose = F, seed.use = 27) %>% 
              RunPCA(npcs = 30, verbose = FALSE)
pdf("~/Li/figures/merged-elbow.pdf")
ElbowPlot(mergedData,ndims = 30)
dev.off()
# Set dim 1:30

pc.num=1:30
mergedData@meta.data <- select(mergedData@meta.data, -one_of("nCount_SCT","nFeature_SCT","SCT_snn_res.0.7","pANN_0.25_0.01_68","seurat_clusters","pANN_0.25_0.04_68","pANN_0.24_0.01_68","DF.classifications_0.25_0.04_68","DF.classifications_0.25_0.01_68"))
head(mergedData@meta.data) 
# Run UMAP
mergedData <- RunUMAP(mergedData, dims = 1:30, reduction = "pca") %>% RunTSNE(dims = 1:30, check_duplicates=FALSE)

3.2 Seurat Integration

split_seurat <- SplitObject(mergedData, split.by="source")
# Select the most variable features to use for integration
integ_features <- SelectIntegrationFeatures(object.list=split_seurat,nfeatures=3000)
# Prepare the SCT list object for integration
split_seurat <- PrepSCTIntegration(object.list = split_seurat,
								   anchor.features = integ_features)
# Perform CCA, find the best buddies or anchors and filter incorrect anchors.  
# Find best buddies - can take a while to run
integ_anchors <- FindIntegrationAnchors(object.list = split_seurat, 
                                        normalization.method = "SCT", 
                                        anchor.features = integ_features)
# Integrate across conditions
integrated.s <- IntegrateData(anchorset = integ_anchors, normalization.method = "SCT")
integrated.s <- RunPCA(integrated.s, npcs = 30, verbose = F) %>% RunUMAP(dims = 1:30, reduction = "pca") %>% RunTSNE(dims = 1:30)
integrated.s
#An object of class Seurat
#52465 features across 5500 samples within 3 assays
#Active assay: integrated (3000 features, 3000 variable features)
# 2 other assays present: RNA, SCT
# 3 dimensional reductions calculated: pca, umap, tsne

pdf("~/Li/figures/integrated_umap.pdf")
DimPlot(integrated.s, group.by = "source", reduction = "umap") 
dev.off()

4、Clustering cells based on top PCs

# Determine the K-nearest neighbor graph
integrated.cluster <- FindNeighbors(object = integrated.s, dims = 1:30)
checkRes <- FindClusters(object = integrated.cluster, resolution = c(seq(0.1,0.9,0.1)), verbose = F)
# pdf(file = "~/Li/figures/integrated-checkres.pdf", height = 8, width = 10, onefile = TRUE)
clustree(checkRes@meta.data,prefix = "integrated_snn_res.")
#dev.off()
integrated.cluster <- FindClusters(integrated.cluster, resolution = 0.2)
# saveRDS(integrated.cluster, file = "~/Li/data/integrated.data.RDS")
#pdf("~/Li/figures/integrated.cluster.pdf")
#DimPlot(object = integrated.cluster, group.by = "seurat_clusters", reduction = "umap",split.by = "source") 
#dev.off()

5、Find Cluster Biomarkers

#All markers
markers <- FindAllMarkers(object = integrated.cluster, only.pos = TRUE, min.pct = 0.20, thresh.use = 0.25)
# save(markers, file = "~/Li/data/markers.RData")
top300 <-  markers %>% group_by(cluster) %>% top_n(300, avg_log2FC)
write.csv(markers, file = "~/Li/data/Allmarkers.csv")
pdf(file = "~/Li/figures/markers-top300-heatmap.pdf", height = 200, width = 10, onefile = TRUE)
DoHeatmap(object = integrated.cluster, features = top300$gene, label = TRUE,group.by = "seurat_clusters")
dev.off()

#加了一列细胞类型及刺激状态
integrated.cluster$celltype.stim <- paste(Idents(integrated.cluster), integrated.cluster$source, sep = "_") 
#加了一列细胞类型
integrated.cluster$celltype <- Idents(integrated.cluster) 
#将细胞类型及刺激状态作为分组
Idents(integrated.cluster) <- "celltype.stim" 

#使用FindMarkers函数寻找差异表达基因 
all.markers <- FindMarkers(integrated.cluster, ident.1 = "0_WT+h", ident.2 = "0_WT", verbose = FALSE)
write.csv(all.markers,"~/Li/data/all_markers.csv")


#beta cells -WT and WT+h
#cluster0 <- subset(integrated.cluster,seurat_clusters==0)
beta.markers <- FindMarkers(integrated.cluster, ident.1 = "0_WT+h", ident.2 = "0_WT", verbose = FALSE)
write.csv(beta.markers,"~/Li/data/beta_markers.csv")

#alpha cells -WT and WT+h
#cluster1 <- subset(integrated.cluster,seurat_clusters==1)
alpha.markers <- FindMarkers(integrated.cluster, ident.1 = "1_WT", ident.2 = "1_WT+h", verbose = FALSE)
write.csv(alpha.markers,"~/Li/data/alpha_markers.csv")


#delta cells -WT and WT+h
#delta cells -WT and WT+h
delta.markers <- FindMarkers(integrated.cluster, ident.1 = "4_WT+h", ident.2 = "4_WT", verbose = FALSE)
write.csv(delta.markers,"~/Li/data/delta_markers.csv")

#neuroendocrine cells
ne.markers <- FindMarkers(integrated.cluster, ident.1 = "2_WT+h", ident.2 = "2_WT", verbose = FALSE)
write.csv(ne.markers,"~/Li/data/neuroendocrine_markers.csv")

#progenitor cells
pro.markers <-FindMarkers(integrated.cluster, ident.1 = "3_WT+h", ident.2 = "3_WT", verbose = FALSE)
write.csv(pro.markers,"~/Li/data/progenitor_markers.csv")

#endocrine cell
end.markers <-FindMarkers(integrated.cluster, ident.1 = "5_WT+h", ident.2 = "5_WT", verbose = FALSE)
write.csv(end.markers,"~/Li/data/endocrine_markers.csv")

#pancreatic polypeptide cell
pan.markers<-FindMarkers(integrated.cluster, ident.1 = "6_WT+h", ident.2 = "6_WT", verbose = FALSE)
write.csv(pan.markers,"~/Li/data/pancreatic_markers.csv")


# 6、Annotation
# β marker：❤INS、NKX6.1、PFKFB2、PDX1、DLK1、NPTX2、Insulin、SYT13 √
# α marker：❤GCG、PLCE1、LOXL4、ARX、IRX2、HMGB3                    √
# Acinar marker：PRSS1
# PP marker: 
# δ marker: ❤SST                                                    √
# cluster0_INS:beta cell   cluster1_GCG:alpha cell  cluster2:CHGA ：Neuroendocrine cell cluster4:LAMA1 ：progenitor cell  cluster5_SST:delta cell  cluster3:SCG2:endocrine cell  cluster6：GHRL:pancreatic polypeptide cell  cluster7：TUBB
# pancreatic progenitor cells:PDX1, NKX6-1, NEUROG3

table(integrated.cluster$seurat_clusters)
#  0    1    2    3    4    5    6    7
#2255 1941  503  245  240  197   80   39

cluster.meta <-integrated.cluster@meta.data
cluster.meta <- arrange(cluster.meta,cluster.meta[,"seurat_clusters"])
indx <- factor(cluster.meta$seurat_clusters, levels = c(0,1,2,3,4,5,6,7)) 
cluster.meta <- cluster.meta[order(indx), ]

cluster.meta$CellType <- c(rep("beta cell_INS",2255),rep("alpha cell_GCG",1941),rep("Neuroendocrine cell_CHGA",503),rep("endocrine cell_SCG2",245),rep("Neuroendocrine cell_LAMA1",240),rep("delta cell_SST",197),rep("pancreatic polypeptide cell_GHRL",80),rep("unknown",39))

integrated.cluster <- AddMetaData(integrated.cluster,cluster.meta)
# saveRDS(integrated.cluster, file = "~/Li/data/integrated-anno.RDS")
pdf("~/Li/figures/anno.pdf",height=5,width=10)
DimPlot(integrated.cluster, group.by = "CellType", reduction = "umap",label = TRUE, label.size = 2)
dev.off()



feature_all <- c("INS","GCG","CHGA","LAMA1","SST","SCG2","GHRL")
pdf(file="~/Li/figures/feature_all.pdf")
FeaturePlot(integrated.cluster, features = feature_all)
dev.off()

Idents(integrated.cluster) <- "seurat_clusters" 
pdf(file="~/Li/figures/cluster0.pdf")
VlnPlot(integrated.cluster, features = "INS")
dev.off()

pdf(file="~/Li/figures/cluster1.pdf")
VlnPlot(integrated.cluster, features = "GCG")
dev.off()

pdf(file="~/Li/figures/cluster2.pdf")
VlnPlot(integrated.cluster, features = "CHGA")
dev.off()

pdf(file="~/Li/figures/cluster3.pdf")
VlnPlot(integrated.cluster, features = "SCG2")
dev.off()

pdf(file="~/Li/figures/cluster4.pdf")
VlnPlot(integrated.cluster, features = "LAMA1")
dev.off()

pdf(file="~/Li/figures/cluster5.pdf")
VlnPlot(integrated.cluster, features = "SST")
dev.off()

pdf(file="~/Li/figures/cluster6.pdf")
VlnPlot(integrated.cluster, features = "GHRL")
dev.off()

天不怕地不怕只怕代码

关注

2
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Li——01

library(Seurat)library(dplyr)library(ggplot2)library(cowplot)library(rlang)library(clustree)library(patchwork)library(ggraph)library(ggsci)library(DoubletFinder)1、Initialize Dataraw.s03.data <- Read10X("~/Li/s03",gene.column=1)raw.s04.data
复制链接

扫一扫