004 文献实战 01

最新推荐文章于 2023-09-14 16:05:11 发布

TTS56

最新推荐文章于 2023-09-14 16:05:11 发布

阅读量472

点赞数 2

分类专栏：单细胞单细胞与深度学习

本文链接：https://blog.csdn.net/qq_40966210/article/details/114015432

版权

单细胞同时被 2 个专栏收录

19 篇文章 18 订阅

订阅专栏

单细胞与深度学习

9 篇文章 10 订阅

订阅专栏

getwd()
setwd('~/Desktop/SingleCellDeepLearning/单细胞/10X/')
dir()
library(magrittr)
library(stringr)
options(stringsAsFactors = F)
options(as.is = T)
rawDf = read.table('GSE84465_GBM_All_data.csv.gz') # 表达谱
rawAnno = read.table("SraRunTable.txt",sep = ",", header = T) # 样本信息
# 浏览数据信息并处理
tail(rawDf[,1:4],10)
# hista 注释软件的到的结果表达谱会有 5 列信息
#且最后5行统计了整个计数过程没有使用到的reads
# 因此我们要将其删除
rawDf = rawDf[1:(nrow(rawDf)-5),]
colnames(rawDf)[1:5]
# 接下来处理注释信息
# ids 为临时变量
ids = rawAnno[,c("plate_id","Well","TISSUE","Patient_ID")]
paste0("X",ids$plate_id[1],".",ids$Well[1])
rownames(ids) = paste0("X",ids$plate_id,".",ids$Well)
identical(colnames(rawDf),rownames(ids))
# 筛选tumor cell
length(which(ids$TISSUE=="Tumor"))
group <- (which(ids$TISSUE=="Tumor")) %>% ids[.,] #筛选的是行
head(group)
df <- (which(ids$TISSUE=="Tumor")) %>%  rawDf[,.] #筛选的是列
dim(df)
group <- data.frame(Patient_ID=group$Patient_ID,
                       row.names = rownames(group))
identical(colnames(df),rownames(group)) # 判断行名和列名是否一致
# 现在已经构建好所需要的数据了
# df 是 count 矩阵
# group 是对应的样本注释信息
table(group$Patient_ID)
library("Seurat") 
# 构建 seurat 对象
# 这个函数 CreateSeuratObject 有多种多样的执行方式
# 只要知道seurat 对象的本质就是表达谱就行
# 由于单细胞的表达谱列是细胞,行是基因
# 如果只有一个病人的话就无所谓注释信息了
# 但是多个样本的情况下,就要用到样本的注释信息
# 这样才可以区分哪一个细胞是哪一个患者的
# 包括由 cellranger 得到的 3 个数据也是一样的

#  1.构建对象的时候进行初步过滤
scRNA = CreateSeuratObject(counts=df,
                           meta.data = group,
                           min.cells = 3, # 被选中的基因至少在 3 个细胞中表达
                           min.features = 50) #被选中的细胞至少表达了 50 种基因
# seurat对象中的 count 矩阵
head(scRNA@assays$RNA@counts[1:4,1:4])
# 对比一下原来的 count 矩阵
head(df[1:4,1:4])
# 发现零的位置变成了·,这里的原因就是节省内存
# 查看注释信息矩阵
head(scRNA@meta.data[1:4,1:4])

# 2.数据清洗
# 数据清洗原则 
# 1. 不符合质量的
# 2. 过滤线粒体 DNA
# 3. 过滤外原DNA
# 在过滤前先计算
table(grepl("^MT-",rownames(scRNA)))
table(grepl("^ERCC-",rownames(scRNA)))
table(grepl("^RP[SL][[:digit:]]",rownames(scRNA)))
scRNA[["percent.mt"]]  = PercentageFeatureSet(scRNA, pattern = "^MT-")
scRNA[["percent.ecc"]] = PercentageFeatureSet(scRNA, pattern = "^ERCC-")
scRNA[["percent.rp"]]  = PercentageFeatureSet(scRNA, pattern = "^RP[SL][[:digit:]]")
dim(scRNA)
summary(scRNA[["nCount_RNA"]])
summary(scRNA[["nFeature_RNA"]])
summary(scRNA[["percent.mt"]]  )
summary(scRNA[["percent.ecc"]] )
summary(scRNA[["percent.rp"]]  )
# 结合可视化和 summary 自行筛选
plot1 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nCount_RNA")
plot2 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot4 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ecc")
plot5 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.rp")
CombinePlots(plots = list(plot1, plot2,plot3, plot4,plot5))

scRNA = subset(scRNA, subset =percent.ecc<40)


#3. 查看方差最大的基因
scRNA<- NormalizeData(scRNA,normalization.method = "LogNormalize", scale.factor = 10000)
scRNA <- FindVariableFeatures(scRNA, selection.method = "vst", nfeatures = 1500) 
#可视化
if(F){
  col.num <- length(unique(scRNA@meta.data$Patient_ID))
  library(ggplot2)
  
  p1_1.1 <- VlnPlot(scRNA,
                    features = c("nFeature_RNA"),
                    group.by = "Patient_ID",
                    cols =rainbow(col.num)) +
    theme(legend.position = "none") +
    labs(tag = "A")
  p1_1.1
  p1_1.2 <- VlnPlot(scRNA,
                    features = c("nCount_RNA"),
                    group.by = "Patient_ID",
                    cols =rainbow(col.num)) +
    theme(legend.position = "none") 
  p1_1.2
  p1_1 <- p1_1.1 | p1_1.2
  p1_1
  VlnPlot(scRNA,
          features = c("nFeature_RNA","nCount_RNA","percent.ERCC"))
  #图B：nCount_RNA与对应的nFeature_RNA关系
  p1_2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
                         group.by = "Patient_ID",pt.size = 1.3) +
    labs(tag = "B")
  p1_2
  FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ERCC")
  top10 <- head(VariableFeatures(scRNA), 10) 
  top10
  plot1 <- VariableFeaturePlot(scRNA) 
  #标记top10 hvg
  p1_3 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, size=2.5) +
    theme(legend.position = c(0.1,0.8)) +
    labs(tag = "C")
  p1_3
  #看看ERCC
  ERCC <- rownames(scRNA)[grep("^ERCC-",rownames(scRNA))]
  LabelPoints(plot = plot1, points = ERCC, repel = TRUE, 
              size=2.5,colour = "blue") +
    theme(legend.position = c(0.1,0.8)) +
    labs(tag = "C")
  #可以直观看到ERCC均不是高变基因，而且部分的ERCC基因表达量确实很高
  p1_1 | p1_2 | p1_3 #上图
  
}

# 4. 归一化 & PCA
#先进行归一化（正态分布）
scRNA <- ScaleData(scRNA, features = (rownames(scRNA)))
#储存到"scale.data"的slot里
GetAssayData(scRNA,slot="scale.data",assay="RNA")[1:8,1:4]
#对比下原来的count矩阵
GetAssayData(scRNA,slot="counts",assay="RNA")[1:8,1:4]
# 4. PCA
#PCA降维，利用之前挑选的hvg，可提高效率
scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA),seed.use=3)
#尝试了seed.use的不同取值发现图形只有四种变化（四个拐角），其中以seed.use=3为代表的一类与原文文献一致
DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")
p2_1 <- DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")+
  labs(tag = "D")

DimPlot(scRNA, reduction = "pca",  split.by = 'Patient_ID')

#挑选主成分，RunPCA默认保留了前50个
scRNA <- JackStraw(scRNA,reduction = "pca", dims=20)
scRNA <- ScoreJackStraw(scRNA,dims = 1:20)
#鉴定前20个主成分的稳定性?
p2_2 <- JackStrawPlot(scRNA,dims = 1:20, reduction = "pca") +
  theme(legend.position="bottom") +
  labs(tag = "E")

p2_3 <- ElbowPlot(scRNA, ndims=20, reduction="pca") 

#结果显示可挑选前20个pc
p2_1 | p2_2 | p2_3

# 5. 聚类
#5.1 聚类
pc.num=1:20
#基于PCA数据
scRNA <- FindNeighbors(scRNA, dims = pc.num) 
# dims参数，需要指定哪些pc轴用于分析；这里利用上面的分析，选择20
scRNA <- FindClusters(scRNA, resolution = 0.5)
table(scRNA@meta.data$seurat_clusters)
# tsne & umap
scRNA = RunTSNE(scRNA, dims = pc.num)
scRNA = RunUMAP(scRNA, dims = pc.num)
DimPlot(scRNA, reduction = "tsne",label=T,split.by = "Patient_ID" ) 
DimPlot(scRNA, reduction = "umap",label=T,split.by = "Patient_ID" ) 

# 6.差异基因分析
#进行差异分析，一般使用标准化数据
diff.wilcox = FindAllMarkers(scRNA)##默认使用wilcox方法挑选差异基因，大概4-5min
#Seurat可以帮助你定义cluster的差异表达，默认情况下，它定义单个cluster的阳性和阴性的
# marker（与其他细胞群比较）。FindAllMarkers可以执行这个过程。
#1簇与其他簇的差异，默认只返回上调的基因（即高表达），参数可改，only.pos = TRUE
#寻找cluster5与cluster0和3之间的差异marker
cluster5.markers <- FindMarkers(scRNA, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
#寻找每一个cluster里与其他所有细胞相比之后的差异marker:运行较久,test.use = "wilcox"
#检验方法默认wilcox，此外还有其他方法
#scRNA.markers <- FindAllMarkers(scRNA, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
##可视化marker基因，VlnPlot （展示在不同cluster里的表达可能分布），
#FeaturePlot（根据tSNE和PCA结果可视化基因表达）是最常用的可视化方法。你也可以用其他一
#些方法，例如：RidgePlot, CellScatter, DotPlot。
VlnPlot(scRNA, features = c("MS4A1", "CD79A"))
FeaturePlot(scRNA, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ"))

head(diff.wilcox)
dim(diff.wilcox)
library(tidyverse)
all.markers = diff.wilcox %>% select(gene, everything()) %>%
  subset(p_val<0.05 & abs(diff.wilcox$avg_logFC) > 0.5)
#An adjusted P value < 0.05  and | log 2 [fold change (FC)] | > 0.5 
#were considered the 2 cutoff criteria for identifying marker genes.
dim(all.markers)
summary(all.markers) 
top10 = all.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
top10
top10 = CaseMatch(search = as.vector(top10$gene), match = rownames(scRNA)) 
top10
length(top10)
length(unique(sort(top10)))

p3_2 <- DoHeatmap(scRNA, features = top10, group.by = "seurat_clusters") +
  labs(tag = "G")
p3_2
p3_1 | p3_2 #下图

### 6、拼图，比较----
p <- (p1_1 | p1_2 | p1_3 ) /
  ((p2_1| p2_2 | p2_3) /
     (p3_1 | p3_2))
p
getwd()
setwd('~/Desktop/SingleCellDeepLearning/单细胞/10X/')
dir()
library(magrittr)
library(stringr)
options(stringsAsFactors = F)
options(as.is = T)
rawDf = read.table('GSE84465_GBM_All_data.csv.gz') # 表达谱
rawAnno = read.table("SraRunTable.txt",sep = ",", header = T) # 样本信息
# 浏览数据信息并处理
tail(rawDf[,1:4],10)
# hista 注释软件的到的结果表达谱会有 5 列信息
#且最后5行统计了整个计数过程没有使用到的reads
# 因此我们要将其删除
rawDf = rawDf[1:(nrow(rawDf)-5),]
colnames(rawDf)[1:5]
# 接下来处理注释信息
# ids 为临时变量
ids = rawAnno[,c("plate_id","Well","TISSUE","Patient_ID")]
paste0("X",ids$plate_id[1],".",ids$Well[1])
rownames(ids) = paste0("X",ids$plate_id,".",ids$Well)
identical(colnames(rawDf),rownames(ids))
# 筛选tumor cell
length(which(ids$TISSUE=="Tumor"))
group <- (which(ids$TISSUE=="Tumor")) %>% ids[.,] #筛选的是行
head(group)
df <- (which(ids$TISSUE=="Tumor")) %>%  rawDf[,.] #筛选的是列
dim(df)
group <- data.frame(Patient_ID=group$Patient_ID,
                       row.names = rownames(group))
identical(colnames(df),rownames(group)) # 判断行名和列名是否一致
# 现在已经构建好所需要的数据了
# df 是 count 矩阵
# group 是对应的样本注释信息
table(group$Patient_ID)
library("Seurat") 
# 构建 seurat 对象
# 这个函数 CreateSeuratObject 有多种多样的执行方式
# 只要知道seurat 对象的本质就是表达谱就行
# 由于单细胞的表达谱列是细胞,行是基因
# 如果只有一个病人的话就无所谓注释信息了
# 但是多个样本的情况下,就要用到样本的注释信息
# 这样才可以区分哪一个细胞是哪一个患者的
# 包括由 cellranger 得到的 3 个数据也是一样的

#  1.构建对象的时候进行初步过滤
scRNA = CreateSeuratObject(counts=df,
                           meta.data = group,
                           min.cells = 3, # 被选中的基因至少在 3 个细胞中表达
                           min.features = 50) #被选中的细胞至少表达了 50 种基因
# seurat对象中的 count 矩阵
head(scRNA@assays$RNA@counts[1:4,1:4])
# 对比一下原来的 count 矩阵
head(df[1:4,1:4])
# 发现零的位置变成了·,这里的原因就是节省内存
# 查看注释信息矩阵
head(scRNA@meta.data[1:4,1:4])

# 2.数据清洗
# 数据清洗原则 
# 1. 不符合质量的
# 2. 过滤线粒体 DNA
# 3. 过滤外原DNA
# 在过滤前先计算
table(grepl("^MT-",rownames(scRNA)))
table(grepl("^ERCC-",rownames(scRNA)))
table(grepl("^RP[SL][[:digit:]]",rownames(scRNA)))
scRNA[["percent.mt"]]  = PercentageFeatureSet(scRNA, pattern = "^MT-")
scRNA[["percent.ecc"]] = PercentageFeatureSet(scRNA, pattern = "^ERCC-")
scRNA[["percent.rp"]]  = PercentageFeatureSet(scRNA, pattern = "^RP[SL][[:digit:]]")
dim(scRNA)
summary(scRNA[["nCount_RNA"]])
summary(scRNA[["nFeature_RNA"]])
summary(scRNA[["percent.mt"]]  )
summary(scRNA[["percent.ecc"]] )
summary(scRNA[["percent.rp"]]  )
# 结合可视化和 summary 自行筛选
plot1 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nCount_RNA")
plot2 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot4 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ecc")
plot5 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.rp")
CombinePlots(plots = list(plot1, plot2,plot3, plot4,plot5))

scRNA = subset(scRNA, subset =percent.ecc<40)


#3. 查看方差最大的基因
scRNA<- NormalizeData(scRNA,normalization.method = "LogNormalize", scale.factor = 10000)
scRNA <- FindVariableFeatures(scRNA, selection.method = "vst", nfeatures = 1500) 
#可视化
if(F){
  col.num <- length(unique(scRNA@meta.data$Patient_ID))
  library(ggplot2)
  
  p1_1.1 <- VlnPlot(scRNA,
                    features = c("nFeature_RNA"),
                    group.by = "Patient_ID",
                    cols =rainbow(col.num)) +
    theme(legend.position = "none") +
    labs(tag = "A")
  p1_1.1
  p1_1.2 <- VlnPlot(scRNA,
                    features = c("nCount_RNA"),
                    group.by = "Patient_ID",
                    cols =rainbow(col.num)) +
    theme(legend.position = "none") 
  p1_1.2
  p1_1 <- p1_1.1 | p1_1.2
  p1_1
  VlnPlot(scRNA,
          features = c("nFeature_RNA","nCount_RNA","percent.ERCC"))
  #图B：nCount_RNA与对应的nFeature_RNA关系
  p1_2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
                         group.by = "Patient_ID",pt.size = 1.3) +
    labs(tag = "B")
  p1_2
  FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ERCC")
  top10 <- head(VariableFeatures(scRNA), 10) 
  top10
  plot1 <- VariableFeaturePlot(scRNA) 
  #标记top10 hvg
  p1_3 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, size=2.5) +
    theme(legend.position = c(0.1,0.8)) +
    labs(tag = "C")
  p1_3
  #看看ERCC
  ERCC <- rownames(scRNA)[grep("^ERCC-",rownames(scRNA))]
  LabelPoints(plot = plot1, points = ERCC, repel = TRUE, 
              size=2.5,colour = "blue") +
    theme(legend.position = c(0.1,0.8)) +
    labs(tag = "C")
  #可以直观看到ERCC均不是高变基因，而且部分的ERCC基因表达量确实很高
  p1_1 | p1_2 | p1_3 #上图
  
}

# 4. 归一化 & PCA
#先进行归一化（正态分布）
scRNA <- ScaleData(scRNA, features = (rownames(scRNA)))
#储存到"scale.data"的slot里
GetAssayData(scRNA,slot="scale.data",assay="RNA")[1:8,1:4]
#对比下原来的count矩阵
GetAssayData(scRNA,slot="counts",assay="RNA")[1:8,1:4]
# 4. PCA
#PCA降维，利用之前挑选的hvg，可提高效率
scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA),seed.use=3)
#尝试了seed.use的不同取值发现图形只有四种变化（四个拐角），其中以seed.use=3为代表的一类与原文文献一致
DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")
p2_1 <- DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")+
  labs(tag = "D")

DimPlot(scRNA, reduction = "pca",  split.by = 'Patient_ID')

#挑选主成分，RunPCA默认保留了前50个
scRNA <- JackStraw(scRNA,reduction = "pca", dims=20)
scRNA <- ScoreJackStraw(scRNA,dims = 1:20)
#鉴定前20个主成分的稳定性?
p2_2 <- JackStrawPlot(scRNA,dims = 1:20, reduction = "pca") +
  theme(legend.position="bottom") +
  labs(tag = "E")

p2_3 <- ElbowPlot(scRNA, ndims=20, reduction="pca") 

#结果显示可挑选前20个pc
p2_1 | p2_2 | p2_3

# 5. 聚类
#5.1 聚类
pc.num=1:20
#基于PCA数据
scRNA <- FindNeighbors(scRNA, dims = pc.num) 
# dims参数，需要指定哪些pc轴用于分析；这里利用上面的分析，选择20
scRNA <- FindClusters(scRNA, resolution = 0.5)
table(scRNA@meta.data$seurat_clusters)
# tsne & umap
scRNA = RunTSNE(scRNA, dims = pc.num)
scRNA = RunUMAP(scRNA, dims = pc.num)
DimPlot(scRNA, reduction = "tsne",label=T,split.by = "Patient_ID" ) 
DimPlot(scRNA, reduction = "umap",label=T,split.by = "Patient_ID" ) 

# 6.差异基因分析
#进行差异分析，一般使用标准化数据
diff.wilcox = FindAllMarkers(scRNA)##默认使用wilcox方法挑选差异基因，大概4-5min
#Seurat可以帮助你定义cluster的差异表达，默认情况下，它定义单个cluster的阳性和阴性的
# marker（与其他细胞群比较）。FindAllMarkers可以执行这个过程。
#1簇与其他簇的差异，默认只返回上调的基因（即高表达），参数可改，only.pos = TRUE
#寻找cluster5与cluster0和3之间的差异marker
cluster5.markers <- FindMarkers(scRNA, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
#寻找每一个cluster里与其他所有细胞相比之后的差异marker:运行较久,test.use = "wilcox"
#检验方法默认wilcox，此外还有其他方法
#scRNA.markers <- FindAllMarkers(scRNA, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
##可视化marker基因，VlnPlot （展示在不同cluster里的表达可能分布），
#FeaturePlot（根据tSNE和PCA结果可视化基因表达）是最常用的可视化方法。你也可以用其他一
#些方法，例如：RidgePlot, CellScatter, DotPlot。
VlnPlot(scRNA, features = c("MS4A1", "CD79A"))
FeaturePlot(scRNA, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ"))

head(diff.wilcox)
dim(diff.wilcox)
library(tidyverse)
all.markers = diff.wilcox %>% select(gene, everything()) %>%
  subset(p_val<0.05 & abs(diff.wilcox$avg_logFC) > 0.5)
#An adjusted P value < 0.05  and | log 2 [fold change (FC)] | > 0.5 
#were considered the 2 cutoff criteria for identifying marker genes.
dim(all.markers)
summary(all.markers) 
top10 = all.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
top10
top10 = CaseMatch(search = as.vector(top10$gene), match = rownames(scRNA)) 
top10
length(top10)
length(unique(sort(top10)))

p3_2 <- DoHeatmap(scRNA, features = top10, group.by = "seurat_clusters") +
  labs(tag = "G")
p3_2
p3_1 | p3_2 #下图

### 6、拼图，比较----
p <- (p1_1 | p1_2 | p1_3 ) /
  ((p2_1| p2_2 | p2_3) /
     (p3_1 | p3_2))
p

如果您觉得我的文章对您有帮助，请点赞+关注，可以的话打个赏奖励一杯星巴克(～￣(OO)￣)ブ

Best Regards,  
Yuan.SH;
School of Basic Medical Sciences,  
Fujian Medical University,  
Fuzhou, Fujian, China.  
please contact with me via the following ways:  
(a) e-mail :yuansh3354@163.com

TTS56

关注

2
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
004 文献实战 01

getwd()setwd('~/Desktop/SingleCellDeepLearning/单细胞/10X/')dir()library(magrittr)library(stringr)options(stringsAsFactors = F)options(as.is = T)rawDf = read.table('GSE84465_GBM_All_data.csv.gz') # 表达谱rawAnno = read.table("SraRunTable.txt",sep = ",",
复制链接

扫一扫