④分析胃癌组蛋白脱乙酰酶HDS模型-Seurat V5标准流程

hx2024

于 2024-08-13 01:20:30 发布

阅读量244

点赞数 6

分类专栏：文献复现文章标签： python windows 开发语言

本文链接：https://blog.csdn.net/hx2024/article/details/141142960

版权

文献复现专栏收录该内容

5 篇文章 0 订阅

订阅专栏

编辑6）过滤指标1:最少表达基因数的细胞 and 最少表达细胞数的基因

3）UMAP/TSNE可视化Harmony

前面有数据的整理和创建Seurat V4数据整理，数据的获取方式一致

②分析胃癌组蛋白脱乙酰酶HDS模型-单细胞数据整理-CSDN博客

Seurat V5数据整理读取

rm(list = ls()) 
library(dplyr)
library(readr)
library(BiocParallel)
library(Seurat)
library(sctransform)#加载包
library(data.table)#读取大文件提升速度使用

#getwd()
#setwd("路径设置")
#getwd()

①数据读取UMI

#文件过大，使用fread函数读取，行是基因，列是细胞
gcmatrix <- fread("raw_counts.csv",
                  sep = ",",header=T)#读取表达矩阵文件(注意第一列行名有cell)
#str(gcmatrix)#‘data.table’:25287 obs. of  111141 variables:
gcmatrix <- as.data.frame(gcmatrix)
rownames(gcmatrix) <- gcmatrix$Gene #基因为行名
gcmatrix <- gcmatrix[,-1]#去除基因列
gcmatrix[1:5,1:5]#展示前几行和几列：内容也是细胞和基因
tail(rownames(gcmatrix),10)#查看最后几行行名：都是基因
dim(gcmatrix)#[1]  25287 111140 

#读取cell_metadata注释文件
gcmeta <- read_csv("cell_metadata.csv")
gcmeta[1:5,1:5]
meta <- gcmeta #数据备份
table(meta$Sample)#查看样本
table(meta$Type)#查看注释细胞类型
table(meta$Patient,meta$Tissue)#查看样本及癌与癌旁细胞数

#校正一下是否相关样本一致#
n_last <- 7 #设置末尾位数：meta数据和细胞名取共同处
meta$ID <- substr(colnames(gcmatrix), nchar(colnames(gcmatrix)) - n_last + 1, 
                  nchar(colnames(gcmatrix)))
identical(meta$Sample,meta$ID)#[1] TRUE  数据是一一对应的
meta1 <- meta[,c(1:6)]#行名还需要为细胞名
meta1 <- as.data.frame(meta1)
rownames(meta1) <- colnames(gcmatrix)

②构建Seurat V5

#进行Seurat对象创建  ?CreateSeuratObject查看
#注意meta数据：行对应的细胞名，列对应的相应注释信息
gcdata <- CreateSeuratObject(gcmatrix, 
                             meta.data = meta1,
                             min.cells = 3,
                             min.features = 200)#创建对象同时质控
dim(gcdata)#查看含有多少细胞
save(gcdata, file = "stepgcdata1.RData")##标准化后的文件

Seurat V5标准流程

胃癌单细胞数据集GSE163558复现(二)：Seurat V5标准流程 (qq.com)

①QC质控

在生成Seurat已经质控:nCount_RNA(每个细胞的UMI数目)和nFeature_RNA（每个细胞中检测到的基因数量）

1）计算线粒体比例

#数据质控
#在生成Seurat已经质控:nCount_RNA(每个细胞的UMI数目)和nFeature_RNA（每个细胞中检测到的基因数量）
#"percent_mito"（表示细胞中线粒体基因的比例）
#”percent_ribo“（核糖体基因比例）和”percent_hb“（红血细胞基因比例）
#计算线粒体基因比例
mito_genes <- rownames(sce.all)[grep("^MT-", rownames(sce.all),ignore.case = T)] #线粒体基因查看
print(mito_genes) #可能是13个线粒体基因，小鼠数据基因名为小写"^mt-"
#sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
sce.all <- PercentageFeatureSet(sce.all, features = mito_genes, col.name = "percent_mito")#线粒体基因计算
fivenum(sce.all@meta.data$percent_mito)#查看前几个线粒体比例

2）计算核糖体基因比例

#计算核糖体基因比例
ribo_genes <- rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)]#查看核糖体基因
print(ribo_genes)
sce.all <- PercentageFeatureSet(sce.all,  features = ribo_genes, col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)

3）计算红血细胞比例

#计算红血细胞基因比例
Hb_genes <- rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)]#查看红细胞基因
print(Hb_genes)
sce.all <- PercentageFeatureSet(sce.all,  features = Hb_genes,col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)
head(sce.all@meta.data)

4）查看metadata

dat <- sce.all@meta.data
head(dat)

5）可视化QC前计算比例

p1：nCount_RNA与nFeature_RNA

p2："percent_mito"，”percent_ribo“和”percent_hb“

p3：nCount_RNA与nFeature_RNA相关性分析

6）过滤指标1:最少表达基因数的细胞 and 最少表达细胞数的基因

一般来说，在CreateSeuratObject的时候已经是进行了这个过滤操作，如果后期看到了自己的单细胞降维聚类分群结果很诡异，就可以回过头来看质量控制环节，先走默认流程即可。

 if(F){
  selected_c <- WhichCells(sce.all, expression = nFeature_RNA > 500)
  selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA$counts > 0 ) > 3]
  sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
  dim(sce.all) 
  dim(sce.all.filt) 
}
sce.all.filt =  sce.all
# par(mar = c(4, 8, 2, 1))
# 这里的C 这个矩阵，有一点大，可以考虑随抽样 
C=subset(sce.all.filt,downsample=100)@assays$RNA$counts
dim(C)
C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100

most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]

pdf("TOP50_most_expressed_gene.pdf",width=14)
boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
        cex = 0.1, las = 1, 
        xlab = "% total count per cell", 
        col = (scales::hue_pal())(50)[50:1], 
        horizontal = TRUE)
dev.off()
rm(C)

7）过滤指标2:线粒体/核糖体基因比例

#过滤指标：percent_mito < 25，percent_ribo > 3，percent_hb < 1
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 25)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 1 )
length(selected_hb)
length(selected_ribo)
length(selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)
table(sce.all.filt$orig.ident)
length(sce.all.filt$orig.ident)

8）可视化过滤后的情况

#可视化过滤后的情况：
feats <- c("nFeature_RNA", "nCount_RNA")
p1_filtered=VlnPlot(sce.all.filt, group.by = "Sample", features = feats, pt.size = 0, ncol = 2) + 
  NoLegend()
w=length(unique(sce.all.filt$Sample))/3+5;w 
#ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered,width = w,height = 5)

feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2_filtered=VlnPlot(sce.all.filt, group.by = "Sample", features = feats, pt.size = 0, ncol = 3) + 
  NoLegend()
w=length(unique(sce.all.filt$Sample))/2+5;w 
p1_filtered
p2_filtered
#ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered,width = w,height = 5)

②harmony整合去批次

细胞筛选之后，我们需要进行harmony整合。第一期我们在创建总的Seurat对象时，使用了merge函数对多个Seurat进行了简单的合并。merge只是按照行和列进行了合并，并未对数据进行其他处理。

1)Harmony去批次前

#Harmony去批次前
#数据标准化
sce.all.filt <- NormalizeData(sce.all.filt, 
                              normalization.method = "LogNormalize",
                              scale.factor = 1e4) 
#筛选高变基因
sce.all.filt <- FindVariableFeatures(sce.all.filt)
p4 <- VariableFeaturePlot(sce.all.filt) 
p4
#数据归一化
sce.all.filt <- ScaleData(sce.all.filt)
#PCA线性降维
sce.all.filt <- RunPCA(sce.all.filt, features = VariableFeatures(object = sce.all.filt))
##可视化PCA结果
#VizDimLoadings(sce.all.filt, dims = 1:2, reduction = "pca")
#DimPlot(sce.all.filt, reduction = "pca") + NoLegend()
#DimHeatmap(sce.all.filt, dims = 1:12, cells = 500, balanced = TRUE)

注意一定需要PCA进行相关的降维

Seurat-SCTransform与harmony整合学习_seurat的harmony报错-CSDN博客

两种不同的方法实现harmony的多个单细胞整合 | 生信菜鸟团 (bio-info-trainee.com)

2）Harmony去批次

#Harmony去批次：注意一定需要PCA降维
library(harmony)
seuratObj <- RunHarmony(sce.all.filt, "Sample")#这里需要换成自己的样本注释列
names(seuratObj@reductions)
#RunHarmony后，Harmony的结果已经在seuratObj的reductions中

3）UMAP/TSNE可视化Harmony

#可视化
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=F ) 
seuratObj <- RunTSNE(seuratObj, dims = 1:15, 
                     reduction = "harmony")

DimPlot(seuratObj,reduction = "tsne",label=F )