单细胞测序的入门操作

最新推荐文章于 2024-04-28 20:33:41 发布

yuxiang&chenxi

最新推荐文章于 2024-04-28 20:33:41 发布

阅读量2.6k

点赞数 1

文章标签：数据库数据挖掘 r语言

本文链接：https://blog.csdn.net/doctor_yuxiang/article/details/125962425

版权

（一）数据读入

在GEO数据库中下载数据，分别有三种文件

1.barcodes 条形码，这是drop-seq进行细胞标记，每一个细胞都有自己的barcode

2.features 里面包含了基因的特征，即Ensembl ID以及与之相对应的基因symbol

3.matrix 表达矩阵，记录每个细胞中每个基因所对应的count数

library(SingleR)
library(dplyr)
library(Seurat)
library(patchwork)
# Read the downloaded 10x-style directory (barcodes / features / matrix files)
# into a count matrix.
highfat.data <- Read10X(data.dir = "../../file location")

# Build the Seurat object that holds the count matrix together with the
# results of later analyses (e.g. PCA and clustering).
#   min.cells = 3      -> keep genes detected in at least 3 cells
#   min.features = 200 -> keep cells with at least 200 detected genes
highfat <- CreateSeuratObject(
  counts = highfat.data,
  project = "highfat",
  min.cells = 3,
  min.features = 200
)

# Inspect the per-cell metadata table.
View(highfat@meta.data)

(二) 数据的前处理主要包括数据的清洗，主要是去除低质量的细胞

依据QC质控进行过滤和选择细胞，数据归一化和缩放以及高变化基因特征的选择
一.QC和细胞的选择
1.常用的QC标准是检测每个细胞中的Unique基因的数目，
a.低质量或者空的Droplet 通常含有很少的基因
b.双细胞或者多细胞（doublet/multiplet）通常含有异常多的基因

2.每个细胞内检测到的分子总数（nCount_RNA），它与Unique基因的数目高度相关

3.线粒体基因组的占比
a.低质量或者死细胞里表现出异常多的线粒体基因组
b.通过PercentageFeatureSet()函数来计算来源于某一组特征基因（此处为线粒体基因）的count所占的百分比
c.以“MT-”开头的基因名来识别线粒体基因：小鼠为mt-，人类为MT-

# ---- QC metrics ----
# Per-cell percentage of counts coming from genes whose names start with "mt-".
highfat[["percent.mt"]] <- PercentageFeatureSet(highfat, pattern = "^mt-")

# Violin plots of the three QC metrics, side by side.
VlnPlot(
  highfat,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)

# Scatter plots of total counts against mitochondrial percentage and against
# the number of detected genes.
plot1 <- FeatureScatter(
  highfat,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
)
plot2 <- FeatureScatter(
  highfat,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)
plot1 + plot2

# ---- Filtering ----
# Keep only cells with more than 200 and fewer than 2500 detected genes and
# with less than 5% mitochondrial counts.
highfat <- subset(
  highfat,
  subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5
)

# ---- Normalization ----
# "LogNormalize" with scale.factor = 10000 (passed explicitly below).
highfat <- NormalizeData(
  highfat,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)

# ---- Highly variable genes ----
highfat <- FindVariableFeatures(
  highfat,
  selection.method = "vst",
  nfeatures = 2000
)

# First ten variable features, labelled on the variable-feature plot.
top10 <- VariableFeatures(highfat) %>% head(10)
plot1 <- VariableFeaturePlot(highfat)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

（三）数据的合并

# ---- Integration of the two samples ----
# Tag every cell with its sample of origin before integrating, so the two
# conditions can still be told apart in the combined object.
# NOTE(review): `control` is not created in this section; it is assumed to be
# a Seurat object prepared the same way as `highfat` -- confirm.
highfat$group <- "highfat"
control$group <- "control"

# Help pages: ?FindIntegrationAnchors, ?IntegrateData
# FindIntegrationAnchors: find a set of anchors between a list of Seurat
#   objects; the anchors are later used by IntegrateData.
# IntegrateData: perform dataset integration using a pre-computed AnchorSet.
diet.anchors <- FindIntegrationAnchors(
  object.list = list(control, highfat),
  dims = 1:10
)
diet.combined <- IntegrateData(anchorset = diet.anchors, dims = 1:10)
DefaultAssay(diet.combined) <- "integrated"

# Run the standard workflow for visualization and clustering.
diet.combined <- ScaleData(diet.combined, verbose = FALSE)
diet.combined <- RunPCA(diet.combined, npcs = 30, verbose = FALSE)

# UMAP embedding and clustering on the first 10 principal components.
diet.combined <- RunUMAP(diet.combined, reduction = "pca", dims = 1:10)
diet.combined <- FindNeighbors(diet.combined, reduction = "pca", dims = 1:10)
diet.combined <- FindClusters(diet.combined, resolution = 0.3)
diet.combined

# Keep the cells belonging to clusters 0 through 7 (eight clusters).
diet <- subset(diet.combined, subset = seurat_clusters %in% 0:7)

# UMAP coloured by sample group (left) and by cluster (right); the two panels
# are combined with patchwork, which is already loaded.
p1 <- DimPlot(diet.combined, reduction = "umap", group.by = "group")
p2 <- DimPlot(diet.combined, reduction = "umap", label = TRUE)
p1 + p2

# One UMAP panel per sample group.
DimPlot(diet.combined, reduction = "umap", split.by = "group")

(四)差异分析

# ---- Marker visualization ----
# FeaturePlot colours single cells on a dimensional reduction plot according
# to a feature (gene expression, PC scores, number of genes detected, etc.).
FeaturePlot(
  diet.combined,
  features = c(
    "Ccr2", "Lyve1", "Timd4", "Cd9", "Plac8", "Lyz1", "Ear2",
    "Napsa", "Cd209a"
  ),
  min.cutoff = "q9"
)
FeaturePlot(
  diet.combined,
  features = c("Ccr2", "Lyve1", "Timd4"),
  split.by = "group",
  max.cutoff = 3,
  cols = c("grey", "red")
)

# One violin plot per gene, split by sample group, stacked in one column.
# wrap_plots() (patchwork) replaces the deprecated CombinePlots().
plots <- VlnPlot(
  diet.combined,
  features = c("Ccr2", "Lyve1", "Timd4", "Lamp2"),
  split.by = "group",
  pt.size = 0,
  combine = FALSE
)
wrap_plots(plots, ncol = 1)

# ---- Cluster labels ----
# The new labels are matched to the current identity levels by position, so
# the vector must have one entry per level. Replace the placeholder numbers
# with cell-type names once the clusters are annotated.
new.cluster.ids <- c("0", "1", "2", "3", "4", "5", "6", "7")
names(new.cluster.ids) <- levels(diet.combined)
diet.combined <- RenameIdents(diet.combined, new.cluster.ids)
DimPlot(diet.combined, reduction = "umap", label = TRUE, pt.size = 0.5) +
  NoLegend()

# The original referred to an object `sce` that is never created in this
# script; the integrated object is used instead.
DimPlot(diet.combined, reduction = "umap", label = TRUE)
unique(Idents(diet.combined))
diet.combined$celltype <- Idents(diet.combined)

# ---- Differential expression: control vs. highfat within cluster 0 ----
# FindMarkers() compares identity classes, so build a "<cluster>_<group>"
# identity first; "0_control" / "0_highfat" do not exist otherwise.
# Assumes the `group` metadata column holds "control" and "highfat".
diet$cluster.group <- paste(Idents(diet), diet$group, sep = "_")
Idents(diet) <- "cluster.group"
# NOTE(review): the test runs on the object's default assay; consider whether
# the un-integrated "RNA" assay should be used for DE instead -- confirm.
mydeg <- FindMarkers(
  diet,
  ident.1 = "0_control",
  ident.2 = "0_highfat",
  verbose = FALSE,
  test.use = "wilcox",
  min.pct = 0.1
)

# Ten genes with the largest avg_log2FC.
top10 <- mydeg %>%
  top_n(n = 10, wt = avg_log2FC) %>%
  row.names()

# Genes with an absolute log2 fold change above 1.
cg_markers_df <- mydeg[abs(mydeg$avg_log2FC) > 1, ]