getwd()
setwd('~/Desktop/SingleCellDeepLearning/单细胞/10X/')
dir()
library(magrittr)
library(stringr)
options(stringsAsFactors = F)
options(as.is = T)
rawDf = read.table('GSE84465_GBM_All_data.csv.gz') # 表达谱
rawAnno = read.table("SraRunTable.txt",sep = ",", header = T) # 样本信息
# 浏览数据信息并处理
tail(rawDf[,1:4],10)
# hista 注释软件的到的结果表达谱会有 5 列信息
#且最后5行统计了整个计数过程没有使用到的reads
# 因此我们要将其删除
rawDf = rawDf[1:(nrow(rawDf)-5),]
colnames(rawDf)[1:5]
# 接下来处理注释信息
# ids 为临时变量
ids = rawAnno[,c("plate_id","Well","TISSUE","Patient_ID")]
paste0("X",ids$plate_id[1],".",ids$Well[1])
rownames(ids) = paste0("X",ids$plate_id,".",ids$Well)
identical(colnames(rawDf),rownames(ids))
# 筛选tumor cell
length(which(ids$TISSUE=="Tumor"))
group <- (which(ids$TISSUE=="Tumor")) %>% ids[.,] #筛选的是行
head(group)
df <- (which(ids$TISSUE=="Tumor")) %>% rawDf[,.] #筛选的是列
dim(df)
group <- data.frame(Patient_ID=group$Patient_ID,
row.names = rownames(group))
identical(colnames(df),rownames(group)) # 判断行名和列名是否一致
# 现在已经构建好所需要的数据了
# df 是 count 矩阵
# group 是对应的样本注释信息
table(group$Patient_ID)
library("Seurat")
# 构建 seurat 对象
# 这个函数 CreateSeuratObject 有多种多样的执行方式
# 只要知道seurat 对象的本质就是表达谱就行
# 由于单细胞的表达谱列是细胞,行是基因
# 如果只有一个病人的话就无所谓注释信息了
# 但是多个样本的情况下,就要用到样本的注释信息
# 这样才可以区分哪一个细胞是哪一个患者的
# 包括由 cellranger 得到的 3 个数据也是一样的
# 1.构建对象的时候进行初步过滤
scRNA = CreateSeuratObject(counts=df,
meta.data = group,
min.cells = 3, # 被选中的基因至少在 3 个细胞中表达
min.features = 50) #被选中的细胞至少表达了 50 种基因
# seurat对象中的 count 矩阵
head(scRNA@assays$RNA@counts[1:4,1:4])
# 对比一下原来的 count 矩阵
head(df[1:4,1:4])
# 发现零的位置变成了·,这里的原因就是节省内存
# 查看注释信息矩阵
head(scRNA@meta.data[1:4,1:4])
# 2.数据清洗
# 数据清洗原则
# 1. 不符合质量的
# 2. 过滤线粒体 DNA
# 3. 过滤外原DNA
# 在过滤前先计算
table(grepl("^MT-",rownames(scRNA)))
table(grepl("^ERCC-",rownames(scRNA)))
table(grepl("^RP[SL][[:digit:]]",rownames(scRNA)))
scRNA[["percent.mt"]] = PercentageFeatureSet(scRNA, pattern = "^MT-")
scRNA[["percent.ecc"]] = PercentageFeatureSet(scRNA, pattern = "^ERCC-")
scRNA[["percent.rp"]] = PercentageFeatureSet(scRNA, pattern = "^RP[SL][[:digit:]]")
dim(scRNA)
summary(scRNA[["nCount_RNA"]])
summary(scRNA[["nFeature_RNA"]])
summary(scRNA[["percent.mt"]] )
summary(scRNA[["percent.ecc"]] )
summary(scRNA[["percent.rp"]] )
# 结合可视化和 summary 自行筛选
plot1 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nCount_RNA")
plot2 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot4 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ecc")
plot5 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.rp")
CombinePlots(plots = list(plot1, plot2,plot3, plot4,plot5))
scRNA = subset(scRNA, subset =percent.ecc<40)
#3. 查看方差最大的基因
scRNA<- NormalizeData(scRNA,normalization.method = "LogNormalize", scale.factor = 10000)
scRNA <- FindVariableFeatures(scRNA, selection.method = "vst", nfeatures = 1500)
#可视化
if(F){
col.num <- length(unique(scRNA@meta.data$Patient_ID))
library(ggplot2)
p1_1.1 <- VlnPlot(scRNA,
features = c("nFeature_RNA"),
group.by = "Patient_ID",
cols =rainbow(col.num)) +
theme(legend.position = "none") +
labs(tag = "A")
p1_1.1
p1_1.2 <- VlnPlot(scRNA,
features = c("nCount_RNA"),
group.by = "Patient_ID",
cols =rainbow(col.num)) +
theme(legend.position = "none")
p1_1.2
p1_1 <- p1_1.1 | p1_1.2
p1_1
VlnPlot(scRNA,
features = c("nFeature_RNA","nCount_RNA","percent.ERCC"))
#图B:nCount_RNA与对应的nFeature_RNA关系
p1_2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
group.by = "Patient_ID",pt.size = 1.3) +
labs(tag = "B")
p1_2
FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ERCC")
top10 <- head(VariableFeatures(scRNA), 10)
top10
plot1 <- VariableFeaturePlot(scRNA)
#标记top10 hvg
p1_3 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, size=2.5) +
theme(legend.position = c(0.1,0.8)) +
labs(tag = "C")
p1_3
#看看ERCC
ERCC <- rownames(scRNA)[grep("^ERCC-",rownames(scRNA))]
LabelPoints(plot = plot1, points = ERCC, repel = TRUE,
size=2.5,colour = "blue") +
theme(legend.position = c(0.1,0.8)) +
labs(tag = "C")
#可以直观看到ERCC均不是高变基因,而且部分的ERCC基因表达量确实很高
p1_1 | p1_2 | p1_3 #上图
}
# 4. 归一化 & PCA
#先进行归一化(正态分布)
scRNA <- ScaleData(scRNA, features = (rownames(scRNA)))
#储存到"scale.data"的slot里
GetAssayData(scRNA,slot="scale.data",assay="RNA")[1:8,1:4]
#对比下原来的count矩阵
GetAssayData(scRNA,slot="counts",assay="RNA")[1:8,1:4]
# 4. PCA
#PCA降维,利用之前挑选的hvg,可提高效率
scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA),seed.use=3)
#尝试了seed.use的不同取值发现图形只有四种变化(四个拐角),其中以seed.use=3为代表的一类与原文文献一致
DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")
p2_1 <- DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")+
labs(tag = "D")
DimPlot(scRNA, reduction = "pca", split.by = 'Patient_ID')
#挑选主成分,RunPCA默认保留了前50个
scRNA <- JackStraw(scRNA,reduction = "pca", dims=20)
scRNA <- ScoreJackStraw(scRNA,dims = 1:20)
#鉴定前20个主成分的稳定性?
p2_2 <- JackStrawPlot(scRNA,dims = 1:20, reduction = "pca") +
theme(legend.position="bottom") +
labs(tag = "E")
p2_3 <- ElbowPlot(scRNA, ndims=20, reduction="pca")
#结果显示可挑选前20个pc
p2_1 | p2_2 | p2_3
# 5. 聚类
#5.1 聚类
pc.num=1:20
#基于PCA数据
scRNA <- FindNeighbors(scRNA, dims = pc.num)
# dims参数,需要指定哪些pc轴用于分析;这里利用上面的分析,选择20
scRNA <- FindClusters(scRNA, resolution = 0.5)
table(scRNA@meta.data$seurat_clusters)
# tsne & umap
scRNA = RunTSNE(scRNA, dims = pc.num)
scRNA = RunUMAP(scRNA, dims = pc.num)
DimPlot(scRNA, reduction = "tsne",label=T,split.by = "Patient_ID" )
DimPlot(scRNA, reduction = "umap",label=T,split.by = "Patient_ID" )
# 6.差异基因分析
#进行差异分析,一般使用标准化数据
diff.wilcox = FindAllMarkers(scRNA)##默认使用wilcox方法挑选差异基因,大概4-5min
#Seurat可以帮助你定义cluster的差异表达,默认情况下,它定义单个cluster的阳性和阴性的
# marker(与其他细胞群比较)。FindAllMarkers可以执行这个过程。
#1簇与其他簇的差异,默认只返回上调的基因(即高表达),参数可改,only.pos = TRUE
#寻找cluster5与cluster0和3之间的差异marker
cluster5.markers <- FindMarkers(scRNA, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
#寻找每一个cluster里与其他所有细胞相比之后的差异marker:运行较久,test.use = "wilcox"
#检验方法默认wilcox,此外还有其他方法
#scRNA.markers <- FindAllMarkers(scRNA, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
##可视化marker基因,VlnPlot (展示在不同cluster里的表达可能分布),
#FeaturePlot(根据tSNE和PCA结果可视化基因表达)是最常用的可视化方法。你也可以用其他一
#些方法,例如:RidgePlot, CellScatter, DotPlot。
VlnPlot(scRNA, features = c("MS4A1", "CD79A"))
FeaturePlot(scRNA, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ"))
head(diff.wilcox)
dim(diff.wilcox)
library(tidyverse)
all.markers = diff.wilcox %>% select(gene, everything()) %>%
subset(p_val<0.05 & abs(diff.wilcox$avg_logFC) > 0.5)
#An adjusted P value < 0.05 and | log 2 [fold change (FC)] | > 0.5
#were considered the 2 cutoff criteria for identifying marker genes.
dim(all.markers)
summary(all.markers)
top10 = all.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
top10
top10 = CaseMatch(search = as.vector(top10$gene), match = rownames(scRNA))
top10
length(top10)
length(unique(sort(top10)))
p3_2 <- DoHeatmap(scRNA, features = top10, group.by = "seurat_clusters") +
labs(tag = "G")
p3_2
p3_1 | p3_2 #下图
### 6、拼图,比较----
p <- (p1_1 | p1_2 | p1_3 ) /
((p2_1| p2_2 | p2_3) /
(p3_1 | p3_2))
p
getwd()
setwd('~/Desktop/SingleCellDeepLearning/单细胞/10X/')
dir()
library(magrittr)
library(stringr)
options(stringsAsFactors = F)
options(as.is = T)
rawDf = read.table('GSE84465_GBM_All_data.csv.gz') # 表达谱
rawAnno = read.table("SraRunTable.txt",sep = ",", header = T) # 样本信息
# 浏览数据信息并处理
tail(rawDf[,1:4],10)
# hista 注释软件的到的结果表达谱会有 5 列信息
#且最后5行统计了整个计数过程没有使用到的reads
# 因此我们要将其删除
rawDf = rawDf[1:(nrow(rawDf)-5),]
colnames(rawDf)[1:5]
# 接下来处理注释信息
# ids 为临时变量
ids = rawAnno[,c("plate_id","Well","TISSUE","Patient_ID")]
paste0("X",ids$plate_id[1],".",ids$Well[1])
rownames(ids) = paste0("X",ids$plate_id,".",ids$Well)
identical(colnames(rawDf),rownames(ids))
# 筛选tumor cell
length(which(ids$TISSUE=="Tumor"))
group <- (which(ids$TISSUE=="Tumor")) %>% ids[.,] #筛选的是行
head(group)
df <- (which(ids$TISSUE=="Tumor")) %>% rawDf[,.] #筛选的是列
dim(df)
group <- data.frame(Patient_ID=group$Patient_ID,
row.names = rownames(group))
identical(colnames(df),rownames(group)) # 判断行名和列名是否一致
# 现在已经构建好所需要的数据了
# df 是 count 矩阵
# group 是对应的样本注释信息
table(group$Patient_ID)
library("Seurat")
# 构建 seurat 对象
# 这个函数 CreateSeuratObject 有多种多样的执行方式
# 只要知道seurat 对象的本质就是表达谱就行
# 由于单细胞的表达谱列是细胞,行是基因
# 如果只有一个病人的话就无所谓注释信息了
# 但是多个样本的情况下,就要用到样本的注释信息
# 这样才可以区分哪一个细胞是哪一个患者的
# 包括由 cellranger 得到的 3 个数据也是一样的
# 1.构建对象的时候进行初步过滤
scRNA = CreateSeuratObject(counts=df,
meta.data = group,
min.cells = 3, # 被选中的基因至少在 3 个细胞中表达
min.features = 50) #被选中的细胞至少表达了 50 种基因
# seurat对象中的 count 矩阵
head(scRNA@assays$RNA@counts[1:4,1:4])
# 对比一下原来的 count 矩阵
head(df[1:4,1:4])
# 发现零的位置变成了·,这里的原因就是节省内存
# 查看注释信息矩阵
head(scRNA@meta.data[1:4,1:4])
# 2.数据清洗
# 数据清洗原则
# 1. 不符合质量的
# 2. 过滤线粒体 DNA
# 3. 过滤外原DNA
# 在过滤前先计算
table(grepl("^MT-",rownames(scRNA)))
table(grepl("^ERCC-",rownames(scRNA)))
table(grepl("^RP[SL][[:digit:]]",rownames(scRNA)))
scRNA[["percent.mt"]] = PercentageFeatureSet(scRNA, pattern = "^MT-")
scRNA[["percent.ecc"]] = PercentageFeatureSet(scRNA, pattern = "^ERCC-")
scRNA[["percent.rp"]] = PercentageFeatureSet(scRNA, pattern = "^RP[SL][[:digit:]]")
dim(scRNA)
summary(scRNA[["nCount_RNA"]])
summary(scRNA[["nFeature_RNA"]])
summary(scRNA[["percent.mt"]] )
summary(scRNA[["percent.ecc"]] )
summary(scRNA[["percent.rp"]] )
# 结合可视化和 summary 自行筛选
plot1 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nCount_RNA")
plot2 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot4 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ecc")
plot5 = FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.rp")
CombinePlots(plots = list(plot1, plot2,plot3, plot4,plot5))
scRNA = subset(scRNA, subset =percent.ecc<40)
#3. 查看方差最大的基因
scRNA<- NormalizeData(scRNA,normalization.method = "LogNormalize", scale.factor = 10000)
scRNA <- FindVariableFeatures(scRNA, selection.method = "vst", nfeatures = 1500)
#可视化
if(F){
col.num <- length(unique(scRNA@meta.data$Patient_ID))
library(ggplot2)
p1_1.1 <- VlnPlot(scRNA,
features = c("nFeature_RNA"),
group.by = "Patient_ID",
cols =rainbow(col.num)) +
theme(legend.position = "none") +
labs(tag = "A")
p1_1.1
p1_1.2 <- VlnPlot(scRNA,
features = c("nCount_RNA"),
group.by = "Patient_ID",
cols =rainbow(col.num)) +
theme(legend.position = "none")
p1_1.2
p1_1 <- p1_1.1 | p1_1.2
p1_1
VlnPlot(scRNA,
features = c("nFeature_RNA","nCount_RNA","percent.ERCC"))
#图B:nCount_RNA与对应的nFeature_RNA关系
p1_2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
group.by = "Patient_ID",pt.size = 1.3) +
labs(tag = "B")
p1_2
FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.ERCC")
top10 <- head(VariableFeatures(scRNA), 10)
top10
plot1 <- VariableFeaturePlot(scRNA)
#标记top10 hvg
p1_3 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, size=2.5) +
theme(legend.position = c(0.1,0.8)) +
labs(tag = "C")
p1_3
#看看ERCC
ERCC <- rownames(scRNA)[grep("^ERCC-",rownames(scRNA))]
LabelPoints(plot = plot1, points = ERCC, repel = TRUE,
size=2.5,colour = "blue") +
theme(legend.position = c(0.1,0.8)) +
labs(tag = "C")
#可以直观看到ERCC均不是高变基因,而且部分的ERCC基因表达量确实很高
p1_1 | p1_2 | p1_3 #上图
}
# 4. 归一化 & PCA
#先进行归一化(正态分布)
scRNA <- ScaleData(scRNA, features = (rownames(scRNA)))
#储存到"scale.data"的slot里
GetAssayData(scRNA,slot="scale.data",assay="RNA")[1:8,1:4]
#对比下原来的count矩阵
GetAssayData(scRNA,slot="counts",assay="RNA")[1:8,1:4]
# 4. PCA
#PCA降维,利用之前挑选的hvg,可提高效率
scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA),seed.use=3)
#尝试了seed.use的不同取值发现图形只有四种变化(四个拐角),其中以seed.use=3为代表的一类与原文文献一致
DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")
p2_1 <- DimPlot(scRNA, reduction = "pca", group.by="Patient_ID")+
labs(tag = "D")
DimPlot(scRNA, reduction = "pca", split.by = 'Patient_ID')
#挑选主成分,RunPCA默认保留了前50个
scRNA <- JackStraw(scRNA,reduction = "pca", dims=20)
scRNA <- ScoreJackStraw(scRNA,dims = 1:20)
#鉴定前20个主成分的稳定性?
p2_2 <- JackStrawPlot(scRNA,dims = 1:20, reduction = "pca") +
theme(legend.position="bottom") +
labs(tag = "E")
p2_3 <- ElbowPlot(scRNA, ndims=20, reduction="pca")
#结果显示可挑选前20个pc
p2_1 | p2_2 | p2_3
# 5. 聚类
#5.1 聚类
pc.num=1:20
#基于PCA数据
scRNA <- FindNeighbors(scRNA, dims = pc.num)
# dims参数,需要指定哪些pc轴用于分析;这里利用上面的分析,选择20
scRNA <- FindClusters(scRNA, resolution = 0.5)
table(scRNA@meta.data$seurat_clusters)
# tsne & umap
scRNA = RunTSNE(scRNA, dims = pc.num)
scRNA = RunUMAP(scRNA, dims = pc.num)
DimPlot(scRNA, reduction = "tsne",label=T,split.by = "Patient_ID" )
DimPlot(scRNA, reduction = "umap",label=T,split.by = "Patient_ID" )
# 6.差异基因分析
#进行差异分析,一般使用标准化数据
diff.wilcox = FindAllMarkers(scRNA)##默认使用wilcox方法挑选差异基因,大概4-5min
#Seurat可以帮助你定义cluster的差异表达,默认情况下,它定义单个cluster的阳性和阴性的
# marker(与其他细胞群比较)。FindAllMarkers可以执行这个过程。
#1簇与其他簇的差异,默认只返回上调的基因(即高表达),参数可改,only.pos = TRUE
#寻找cluster5与cluster0和3之间的差异marker
cluster5.markers <- FindMarkers(scRNA, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
#寻找每一个cluster里与其他所有细胞相比之后的差异marker:运行较久,test.use = "wilcox"
#检验方法默认wilcox,此外还有其他方法
#scRNA.markers <- FindAllMarkers(scRNA, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
##可视化marker基因,VlnPlot (展示在不同cluster里的表达可能分布),
#FeaturePlot(根据tSNE和PCA结果可视化基因表达)是最常用的可视化方法。你也可以用其他一
#些方法,例如:RidgePlot, CellScatter, DotPlot。
VlnPlot(scRNA, features = c("MS4A1", "CD79A"))
FeaturePlot(scRNA, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ"))
head(diff.wilcox)
dim(diff.wilcox)
library(tidyverse)
all.markers = diff.wilcox %>% select(gene, everything()) %>%
subset(p_val<0.05 & abs(diff.wilcox$avg_logFC) > 0.5)
#An adjusted P value < 0.05 and | log 2 [fold change (FC)] | > 0.5
#were considered the 2 cutoff criteria for identifying marker genes.
dim(all.markers)
summary(all.markers)
top10 = all.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
top10
top10 = CaseMatch(search = as.vector(top10$gene), match = rownames(scRNA))
top10
length(top10)
length(unique(sort(top10)))
p3_2 <- DoHeatmap(scRNA, features = top10, group.by = "seurat_clusters") +
labs(tag = "G")
p3_2
p3_1 | p3_2 #下图
### 6、拼图,比较----
p <- (p1_1 | p1_2 | p1_3 ) /
((p2_1| p2_2 | p2_3) /
(p3_1 | p3_2))
p
如果您觉得我的文章对您有帮助,请点赞+关注,可以的话打个赏奖励一杯星巴克(~ ̄(OO) ̄)ブ
Best Regards,
Yuan.SH;
School of Basic Medical Sciences,
Fujian Medical University,
Fuzhou, Fujian, China.
please contact with me via the following ways:
(a) e-mail :yuansh3354@163.com